From 357adda8912905e1986cf4237f85f6d9982b3cd5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 15:19:38 -0500 Subject: [PATCH 001/182] Add URL Cache --- .../dae00e5aa8dd_create_rooturlcache.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 collector_db/alembic/versions/dae00e5aa8dd_create_rooturlcache.py diff --git a/collector_db/alembic/versions/dae00e5aa8dd_create_rooturlcache.py b/collector_db/alembic/versions/dae00e5aa8dd_create_rooturlcache.py new file mode 100644 index 00000000..c95b10e0 --- /dev/null +++ b/collector_db/alembic/versions/dae00e5aa8dd_create_rooturlcache.py @@ -0,0 +1,34 @@ +"""Create RootURLCache + +Revision ID: dae00e5aa8dd +Revises: dcd158092de0 +Create Date: 2025-01-19 10:40:19.650982 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'dae00e5aa8dd' +down_revision: Union[str, None] = 'dcd158092de0' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table('root_url_cache', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('url', sa.String(), nullable=False), + sa.Column('page_title', sa.String(), nullable=False), + sa.Column('page_description', sa.String(), nullable=True), + sa.Column('updated_at', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('url', name='root_url_cache_uq_url') + ) + + +def downgrade() -> None: + op.drop_table('root_url_cache') From 541ce853aad9c2a5fad572375e04bbba7241c2bb Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 15:28:43 -0500 Subject: [PATCH 002/182] Change transformers to use PyTorch --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c05cfbfa..7406d1ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,8 +33,7 @@ APScheduler~=3.11.0 alembic~=1.14.0 asyncpg~=0.30.0 pytest-asyncio~=0.25.2 -transformers~=4.40.2 -tf-keras~=2.18.0 +transformers[torch]~=4.40.2 # HTML Collector playwright~=1.49.1 From 9c604deb9c90697b2394f68c36c87a6bfdf6c524 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 16:08:11 -0500 Subject: [PATCH 003/182] Change transformers to use tf-keras --- Dockerfile | 2 ++ requirements.txt | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 8e64b85d..1287b9d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,8 @@ COPY . . 
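# Hedged aside on the two Playwright steps added below: recent Playwright CLIs
# also accept a combined form, e.g. `RUN playwright install --with-deps`, which
# installs the browsers and their OS dependencies in a single layer. The flag is
# assumed to be available in the pinned playwright~=1.49.1; verify before use.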
# Install dependencies RUN pip install --no-cache-dir -r requirements.txt +RUN playwright install +RUN playwright install-deps # Expose the application port EXPOSE 80 diff --git a/requirements.txt b/requirements.txt index 7406d1ef..c05cfbfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,8 @@ APScheduler~=3.11.0 alembic~=1.14.0 asyncpg~=0.30.0 pytest-asyncio~=0.25.2 -transformers[torch]~=4.40.2 +transformers~=4.40.2 +tf-keras~=2.18.0 # HTML Collector playwright~=1.49.1 From 4231c477919d87be48131111df8662d95e26fe8c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 17:46:37 -0500 Subject: [PATCH 004/182] Reduce size of Dockerfile --- .project-root | 0 Dockerfile | 21 +++++++++++++++++++-- core/SourceCollectorCore.py | 20 -------------------- util/miscellaneous_functions.py | 2 +- 4 files changed, 20 insertions(+), 23 deletions(-) create mode 100644 .project-root diff --git a/.project-root b/.project-root new file mode 100644 index 00000000..e69de29b diff --git a/Dockerfile b/Dockerfile index 1287b9d0..2d719b81 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,15 +5,32 @@ FROM python:3.12.8 # Set working directory WORKDIR /app -# Copy project files -COPY . . +COPY requirements.txt ./requirements.txt # Install dependencies RUN pip install --no-cache-dir -r requirements.txt RUN playwright install RUN playwright install-deps +# Copy project files +COPY agency_identifier ./agency_identifier +COPY api ./api +COPY collector_db ./collector_db +COPY collector_manager ./collector_manager +COPY core ./core +COPY html_tag_collector ./html_tag_collector +COPY hugging_face/url_relevance ./hugging_face/url_relevance +COPY hugging_face/HuggingFaceInterface.py ./hugging_face/HuggingFaceInterface.py +COPY source_collectors ./source_collectors +COPY util ./util +COPY alembic.ini ./alembic.ini +COPY apply_migrations.py ./apply_migrations.py +COPY security_manager ./security_manager +COPY execute.sh ./execute.sh +COPY .project-root ./.project-root + # Expose the application port EXPOSE 80 +COPY .env ./.env RUN chmod +x execute.sh \ No newline at end of file diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index b341bda3..cf4ad3a3 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -12,12 +12,9 @@ from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse -from core.DTOs.LabelStudioExportResponseInfo import LabelStudioExportResponseInfo from core.DTOs.MessageResponse import MessageResponse from core.ScheduledTaskManager import ScheduledTaskManager from core.enums import BatchStatus -from label_studio_interface.DTOs.LabelStudioTaskExportInfo import LabelStudioTaskExportInfo -from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager class SourceCollectorCore: @@ -25,7 +22,6 @@ def __init__( self, core_logger: CoreLogger, db_client: DatabaseClient = DatabaseClient(), - label_studio_api_manager: LabelStudioAPIManager = LabelStudioAPIManager(), dev_mode: bool = False ): self.db_client = db_client @@ -38,7 +34,6 @@ def __init__( self.scheduled_task_manager = ScheduledTaskManager(db_client=db_client) else: self.scheduled_task_manager = None - self.label_studio_api_manager = label_studio_api_manager def get_batch_info(self, batch_id: int) -> BatchInfo: return self.db_client.get_batch_by_id(batch_id) @@ -98,21 +93,6 @@ def get_batch_logs(self, batch_id: int) -> 
GetBatchLogsResponse: logs = self.db_client.get_logs_by_batch_id(batch_id) return GetBatchLogsResponse(logs=logs) - def export_batch_to_label_studio(self, batch_id: int) -> LabelStudioExportResponseInfo: - # TODO: Might this need to be a separate thread? - db_url_infos = self.db_client.get_urls_by_batch(batch_id) - url_count = len(db_url_infos) - export_infos = [] - for url_info in db_url_infos: - export_infos.append(LabelStudioTaskExportInfo(url=url_info.url)) - import_id = self.label_studio_api_manager.export_tasks_into_project( - data=export_infos - ) - return LabelStudioExportResponseInfo( - label_studio_import_id=import_id, - num_urls_imported=url_count - ) - def abort_batch(self, batch_id: int) -> MessageResponse: self.collector_manager.abort_collector(cid=batch_id) return MessageResponse(message=f"Batch aborted.") diff --git a/util/miscellaneous_functions.py b/util/miscellaneous_functions.py index d27793ff..4b0bc88b 100644 --- a/util/miscellaneous_functions.py +++ b/util/miscellaneous_functions.py @@ -32,7 +32,7 @@ def get_project_root() -> Path: """ # Define the root markers that signify the root directory of the project - root_markers = ['.git'] # Add more markers as needed + root_markers = ['execute.sh'] # Add more markers as needed # Start from the current file's directory current_dir = Path(__file__).resolve().parent while current_dir != current_dir.parent: # Check if we've reached the root of the filesystem From bc68f287b2bcc579a935cb054762d6fdae361c59 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 17:51:34 -0500 Subject: [PATCH 005/182] Reduce size of Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2d719b81..96e4fe91 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,5 +32,5 @@ COPY .project-root ./.project-root # Expose the application port EXPOSE 80 -COPY .env ./.env +#COPY .env ./.env RUN chmod +x execute.sh \ No newline at end of file From f8826d86e791398df88c29c4615cb773acd51078 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 18:02:29 -0500 Subject: [PATCH 006/182] Set playwright to only install chromium --- Dockerfile | 6 ++++-- html_tag_collector/URLRequestInterface.py | 6 ++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 96e4fe91..b2949405 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ COPY requirements.txt ./requirements.txt # Install dependencies RUN pip install --no-cache-dir -r requirements.txt -RUN playwright install +RUN playwright install chromium RUN playwright install-deps # Copy project files @@ -32,5 +32,7 @@ COPY .project-root ./.project-root # Expose the application port EXPOSE 80 +RUN chmod +x execute.sh +# Use the below for ease of local development, but remove when pushing to GitHub +# Because there is no .env file in the repository (for security reasons) #COPY .env ./.env -RUN chmod +x execute.sh \ No newline at end of file diff --git a/html_tag_collector/URLRequestInterface.py b/html_tag_collector/URLRequestInterface.py index d6c8ace2..6c6756d0 100644 --- a/html_tag_collector/URLRequestInterface.py +++ b/html_tag_collector/URLRequestInterface.py @@ -31,10 +31,8 @@ class RequestResources: semaphore: asyncio.Semaphore = asyncio.Semaphore(MAX_CONCURRENCY) def ensure_browsers_installed(): - print("Installing browsers...") - result = subprocess.run("playwright install", shell=True, capture_output=True, text=True) - print(result.stdout) - print(result.stderr) + # TODO: Slated for destruction + pass 
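# A hedged sketch, not project code: if the stub above were ever revived as a
# runtime fallback (instead of baking Chromium into the Docker image), a guard
# along these lines could work. `sync_playwright` is Playwright's documented
# sync entry point; the function name and fallback behavior are assumptions.
#
#     def ensure_chromium_installed():
#         from playwright.sync_api import sync_playwright
#         try:
#             with sync_playwright() as p:
#                 p.chromium.launch(headless=True).close()
#         except Exception:
#             subprocess.run("playwright install chromium", shell=True, check=True)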
HTML_CONTENT_TYPE = "text/html" From aabc499db594e1ea8ff8c02c8bd3182cf8b2e04d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 23 Jan 2025 18:16:54 -0500 Subject: [PATCH 007/182] Remove unused components --- source_collectors/muckrock/.gitignore | 1 - .../muckrock/classes/FOIADBSearcher.py | 65 ----- source_collectors/muckrock/constants.py | 1 - .../muckrock/create_foia_data_db.py | 260 ------------------ .../muckrock/get_allegheny_foias.py | 67 ----- .../muckrock/search_foia_data_db.py | 141 ---------- 6 files changed, 535 deletions(-) delete mode 100644 source_collectors/muckrock/classes/FOIADBSearcher.py delete mode 100644 source_collectors/muckrock/create_foia_data_db.py delete mode 100644 source_collectors/muckrock/get_allegheny_foias.py delete mode 100644 source_collectors/muckrock/search_foia_data_db.py diff --git a/source_collectors/muckrock/.gitignore b/source_collectors/muckrock/.gitignore index 3ad8c498..5047d9bc 100644 --- a/source_collectors/muckrock/.gitignore +++ b/source_collectors/muckrock/.gitignore @@ -226,4 +226,3 @@ flycheck_*.el *.json *.csv /csv -last_page_fetched.txt diff --git a/source_collectors/muckrock/classes/FOIADBSearcher.py b/source_collectors/muckrock/classes/FOIADBSearcher.py deleted file mode 100644 index 391f7a8d..00000000 --- a/source_collectors/muckrock/classes/FOIADBSearcher.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import sqlite3 - -import pandas as pd - -from source_collectors.muckrock.constants import FOIA_DATA_DB - -check_results_table_query = """ - SELECT name FROM sqlite_master - WHERE (type = 'table') - AND (name = 'results') - """ - -search_foia_query = """ - SELECT * FROM results - WHERE (title LIKE ? OR tags LIKE ?) - AND (status = 'done') - """ - - -class FOIADBSearcher: - - def __init__(self, db_path = FOIA_DATA_DB): - self.db_path = db_path - if not os.path.exists(self.db_path): - raise FileNotFoundError("foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.") - - - def search(self, search_string: str) -> pd.DataFrame | None: - """ - Searches the foia_data.db database for FOIA request entries matching the provided search string. - - Args: - search_string (str): The string to search for in the `title` and `tags` of the `results` table. - - Returns: - Union[pandas.DataFrame, None]: - - pandas.DataFrame: A DataFrame containing the matching entries from the database. - - None: If an error occurs during the database operation. - - Raises: - sqlite3.Error: If any database operation fails, prints error and returns None. - Exception: If any unexpected error occurs, prints error and returns None. 
- """ - try: - with sqlite3.connect(self.db_path) as conn: - results_table = pd.read_sql_query(check_results_table_query, conn) - if results_table.empty: - print("The `results` table does not exist in the database.") - return None - - df = pd.read_sql_query( - sql=search_foia_query, - con=conn, - params=[f"%{search_string}%", f"%{search_string}%"] - ) - - except sqlite3.Error as e: - print(f"Sqlite error: {e}") - return None - except Exception as e: - print(f"An unexpected error occurred: {e}") - return None - - return df \ No newline at end of file diff --git a/source_collectors/muckrock/constants.py b/source_collectors/muckrock/constants.py index 07dca8f4..f152d8c4 100644 --- a/source_collectors/muckrock/constants.py +++ b/source_collectors/muckrock/constants.py @@ -1,4 +1,3 @@ BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" -FOIA_DATA_DB = "foia_data.db" \ No newline at end of file diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py deleted file mode 100644 index 9114801c..00000000 --- a/source_collectors/muckrock/create_foia_data_db.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -create_foia_data_db.py - -This script fetches data from the MuckRock FOIA API and stores it in a SQLite database. -Run this prior to companion script `search_foia_data_db.py`. - -A successful run will output a SQLite database `foia_data.db` with one table `results`. -The database will contain all FOIA requests available through MuckRock. - -Functions: - - create_db() - - fetch_page() - - transform_page_data() - - populate_db() - - main() - -Error Handling: -Errors encountered during API requests or database operations are logged to an `errors.log` file -and/or printed to the console. -""" - -import json -import logging -import os -import time -from typing import List, Tuple, Dict, Any - -from tqdm import tqdm - -from source_collectors.muckrock.classes.SQLiteClient import SQLiteClientContextManager, SQLClientError -from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher -from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError - -logging.basicConfig( - filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" -) - -# TODO: Why are we pulling every single FOIA request? - -last_page_fetched = "last_page_fetched.txt" - -NO_MORE_DATA = -1 # flag for program exit -JSON = Dict[str, Any] # type alias - - -create_table_query = """ - CREATE TABLE IF NOT EXISTS results ( - id INTEGER PRIMARY KEY, - title TEXT, - slug TEXT, - status TEXT, - embargo_status TEXT, - user INTEGER, - username TEXT, - agency INTEGER, - datetime_submitted TEXT, - date_due TEXT, - days_until_due INTEGER, - date_followup TEXT, - datetime_done TEXT, - datetime_updated TEXT, - date_embargo TEXT, - tracking_id TEXT, - price TEXT, - disable_autofollowups BOOLEAN, - tags TEXT, - communications TEXT, - absolute_url TEXT - ) - """ - - -foia_insert_query = """ - INSERT INTO results (id, title, slug, status, embargo_status, user, username, agency, - datetime_submitted, date_due, days_until_due, date_followup, - datetime_done, datetime_updated, date_embargo, tracking_id, - price, disable_autofollowups, tags, communications, absolute_url) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """ - - -def create_db() -> bool: - """ - Creates foia_data.db SQLite database with one table named `results`. - - Returns: - bool: True, if database is successfully created; False otherwise. 
- - Raises: - sqlite3.Error: If the table creation operation fails, - prints error and returns False. - """ - with SQLiteClientContextManager("foia_data.db") as client: - try: - client.execute_query(create_table_query) - return True - except SQLClientError as e: - print(f"SQLite error: {e}.") - logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") - return False - -def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: - """ - Transforms the data received from the MuckRock FOIA API - into a structured format for insertion into a database with `populate_db()`. - - Transforms JSON input into a list of tuples, - as well as serializes the nested `tags` and `communications` fields - into JSON strings. - - Args: - data_to_transform: The JSON data from the API response. - Returns: - A list of tuples, where each tuple contains the fields - of a single FOIA request. - """ - - transformed_data = [] - - for result in data_to_transform.get("results", []): - result["tags"] = json.dumps(result.get("tags", [])) - result["communications"] = json.dumps(result.get("communications", [])) - - transformed_data.append( - ( - result["id"], - result["title"], - result["slug"], - result["status"], - result["embargo_status"], - result["user"], - result["username"], - result["agency"], - result["datetime_submitted"], - result["date_due"], - result["days_until_due"], - result["date_followup"], - result["datetime_done"], - result["datetime_updated"], - result["date_embargo"], - result["tracking_id"], - result["price"], - result["disable_autofollowups"], - result["tags"], - result["communications"], - result["absolute_url"], - ) - ) - return transformed_data - - -def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: - """ - Populates foia_data.db SQLite database with the transfomed FOIA request data. - - Args: - transformed_data (List[Tuple[Any, ...]]): A list of tuples, where each tuple contains the fields of a single FOIA request. - page (int): The current page number for printing and logging errors. - - Returns: - None - - Raises: - sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are - exhausted, logs error and exits. - """ - with SQLiteClientContextManager("foia_data.db") as client: - retries = 0 - max_retries = 2 - while retries < max_retries: - try: - client.execute_query(foia_insert_query, many=transformed_data) - print("Successfully inserted data!") - return - except SQLClientError as e: - print(f"{e}. Retrying...") - retries += 1 - time.sleep(1) - - if retries == max_retries: - report_max_retries_error(max_retries, page) - - -def report_max_retries_error(max_retries, page): - print( - f"Failed to insert data from page {page} after { - max_retries} attempts. Skipping to next page." - ) - logging.error( - f"Failed to insert data from page {page} after { - max_retries} attempts." - ) - - -def main() -> None: - """ - Main entry point for create_foia_data_db.py. - - This function orchestrates the process of fetching - FOIA requests data from the MuckRock FOIA API, transforming it, - and storing it in a SQLite database. 
- """ - - if not os.path.exists("foia_data.db"): - print("Creating foia_data.db...") - success = create_db() - if success == False: - print("Failed to create foia_data.db") - return - - start_page = get_start_page() - fetcher = FOIAFetcher( - start_page=start_page - ) - - with tqdm(initial=start_page, unit="page") as pbar: - while True: - - # TODO: Build collector that does similar logic - try: - pbar.update() - page_data = fetcher.fetch_next_page() - except MuckrockNoMoreDataError: - # Exit program because no more data exists - break - if page_data is None: - continue - transformed_data = transform_page_data(page_data) - populate_db(transformed_data, fetcher.current_page) - - with open(last_page_fetched, mode="w") as file: - file.write(str(fetcher.current_page)) - - print("create_foia_data_db.py run finished") - - -def get_start_page(): - """ - Returns the page number to start fetching from. - - If the file `last_page_fetched` exists, - reads the page number from the file and returns it + 1. - Otherwise, returns 1. - """ - if os.path.exists(last_page_fetched): - with open(last_page_fetched, mode="r") as file: - page = int(file.read()) + 1 - else: - page = 1 - return page - - -if __name__ == "__main__": - try: - main() - except Exception as e: - logging.error(f"An unexpected error occurred: {e}") - print( - "Check errors.log to review errors. Run create_foia_data_db.py again to continue" - ) diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py deleted file mode 100644 index ddfb1d60..00000000 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Get Allegheny County FOIA requests -and save them to a JSON file - -""" - -from source_collectors.muckrock.classes.fetch_requests.FOIALoopFetchRequest import FOIALoopFetchRequest -from source_collectors.muckrock.classes.muckrock_fetchers import JurisdictionLoopFetchRequest, \ - JurisdictionLoopFetcher -from source_collectors.muckrock.classes.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetcher -from source_collectors.muckrock.utils import save_json_file - - -def fetch_jurisdiction_ids(town_file, level="l", parent=126): - """ - fetch jurisdiction IDs based on town names from a text file - """ - with open(town_file, "r") as file: - town_names = [line.strip() for line in file] - - request = JurisdictionLoopFetchRequest( - level=level, parent=parent, town_names=town_names - ) - - fetcher = JurisdictionLoopFetcher(request) - fetcher.loop_fetch() - return fetcher.jurisdictions - - - -def fetch_foia_data(jurisdiction_ids): - """ - fetch FOIA data for each jurisdiction ID and save it to a JSON file - """ - all_data = [] - for name, id_ in jurisdiction_ids.items(): - print(f"\nFetching records for {name}...") - request = FOIALoopFetchRequest(jurisdiction=id_) - fetcher = FOIALoopFetcher(request) - fetcher.loop_fetch() - all_data.extend(fetcher.ffm.results) - - # Save the combined data to a JSON file - save_json_file(file_path="foia_data_combined.json", data=all_data) - print(f"Saved {len(all_data)} records to foia_data_combined.json") - - -def main(): - """ - Execute the script - """ - town_file = "allegheny-county-towns.txt" - # Fetch jurisdiction IDs based on town names - jurisdiction_ids = fetch_jurisdiction_ids( - town_file, - level="l", - parent=126 - ) - print(f"Jurisdiction IDs fetched: {jurisdiction_ids}") - - # Fetch FOIA data for each jurisdiction ID - fetch_foia_data(jurisdiction_ids) - - -# Run the main function -if __name__ == "__main__": - 
main() diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py deleted file mode 100644 index ede7d1de..00000000 --- a/source_collectors/muckrock/search_foia_data_db.py +++ /dev/null @@ -1,141 +0,0 @@ -""" -search_foia_data_db.py - -This script provides search functionality for the `foia_data.db` SQLite database. The search looks in `title`s and -`tags` of FOIA requests that match an input string provided by the user. -Run this after companion script `create_foia_data_db.py`. - -A successful run will output a JSON file containing entries matching the search string. - -Functions: - - parser_init() - - search_foia_db() - - parse_communications_column() - - generate_json() - - main() - -Error Handling: -Errors encountered during database operations, JSON parsing, or file writing are printed to the console. -""" - -import argparse -import json -from typing import Union, List, Dict - -import pandas as pd - -from source_collectors.muckrock.classes.FOIADBSearcher import FOIADBSearcher - - -def parser_init() -> argparse.ArgumentParser: - """ - Initializes the argument parser for search_foia_data_db.py. - - Returns: - argparse.ArgumentParser: The configured argument parser. - """ - - parser = argparse.ArgumentParser( - description="Search foia_data.db and generate a JSON file of resulting matches" - ) - parser.add_argument( - "--search_for", - type=str, - required=True, - metavar="", - help="Provide a string to search foia_data.db", - ) - - return parser - - -def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]: - searcher = FOIADBSearcher() - return searcher.search(search_string) - - -def parse_communications_column(communications) -> List[Dict]: - """ - Parses a communications column value, decoding it from JSON format. - - Args: - communications : The input value to be parsed, which can be a JSON string or NaN. - - Returns: - list (List[Dict]): A list containing the parsed JSON data. If the input is NaN (missing values) or - there is a JSON decoding error, an empty list is returned. - - Raises: - json.JSONDecodeError: If deserialization fails, prints error and returns empty list. - """ - - if pd.isna(communications): - return [] - try: - return json.loads(communications) - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - return [] - - -def generate_json(df: pd.DataFrame, search_string: str) -> None: - """ - Generates a JSON file from a pandas DataFrame. - - Args: - df (pandas.DataFrame): The DataFrame containing the data to be written to the JSON file. - - search_string (str): The string used to name the output JSON file. Spaces in the string - are replaced with underscores. - - Returns: - None - - Raises: - Exception: If writing to JSON file operation fails, prints error and returns. - """ - - output_json = f"{search_string.replace(' ', '_')}.json" - - try: - df.to_json(output_json, orient="records", indent=4) - print(f'Matching entries written to "{output_json}"') - except Exception as e: - print(f"An error occurred while writing JSON: {e}") - - -def main() -> None: - """ - Function to search the foia_data.db database for entries matching a specified search string. - - Command Line Args: - --search_for (str): A string to search for in the `title` and `tags` fields of FOIA requests. 
- """ - - parser = parser_init() - args = parser.parse_args() - search_string = args.search_for - - df = search_foia_db(search_string) - if df is None: - return - update_communications_column(df) - - announce_matching_entries(df, search_string) - - generate_json(df, search_string) - - -def announce_matching_entries(df, search_string): - print( - f'Found {df.shape[0]} matching entries containing "{search_string}" in the title or tags' - ) - - -def update_communications_column(df): - if not df["communications"].empty: - df["communications"] = df["communications"].apply(parse_communications_column) - - -if __name__ == "__main__": - main() From c743cdd3b7e72365edb09d56c0697d9ef0545fb6 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 24 Jan 2025 15:34:37 -0500 Subject: [PATCH 008/182] Add url_record_type_labeling directory and add init files to hugging_face directory and subdirectories --- hugging_face/__init__.py | 0 hugging_face/example/__init__.py | 0 hugging_face/testing/__init__.py | 0 hugging_face/url_record_type_labeling/__init__.py | 0 hugging_face/url_relevance/__init__.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 hugging_face/__init__.py create mode 100644 hugging_face/example/__init__.py create mode 100644 hugging_face/testing/__init__.py create mode 100644 hugging_face/url_record_type_labeling/__init__.py create mode 100644 hugging_face/url_relevance/__init__.py diff --git a/hugging_face/__init__.py b/hugging_face/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hugging_face/example/__init__.py b/hugging_face/example/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hugging_face/testing/__init__.py b/hugging_face/testing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hugging_face/url_record_type_labeling/__init__.py b/hugging_face/url_record_type_labeling/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hugging_face/url_relevance/__init__.py b/hugging_face/url_relevance/__init__.py new file mode 100644 index 00000000..e69de29b From 076fbbba8c233e6dc1486cc15e2a595e07fe1b13 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 24 Jan 2025 15:35:01 -0500 Subject: [PATCH 009/182] Add url_record_type_labeling directory to Dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index b2949405..86bd21b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,7 @@ COPY collector_manager ./collector_manager COPY core ./core COPY html_tag_collector ./html_tag_collector COPY hugging_face/url_relevance ./hugging_face/url_relevance +COPY hugging_face/url_record_type_labeling ./hugging_face/url_record_type_labeling COPY hugging_face/HuggingFaceInterface.py ./hugging_face/HuggingFaceInterface.py COPY source_collectors ./source_collectors COPY util ./util From 8f81089c079824bb2b399bca88e8a064dcf5a3df Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 27 Jan 2025 09:12:56 -0500 Subject: [PATCH 010/182] Remove unused json files --- html_tag_collector/url_cache.json | 5 ----- html_tag_collector/urls.json | 17 ----------------- 2 files changed, 22 deletions(-) delete mode 100644 html_tag_collector/url_cache.json delete mode 100644 html_tag_collector/urls.json diff --git a/html_tag_collector/url_cache.json b/html_tag_collector/url_cache.json deleted file mode 100644 index d4a340e1..00000000 --- a/html_tag_collector/url_cache.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "http://www.example.com": "Example Domain", - "http://www.google.com": "Google", - 
"https://books.toscrape.com": "\n All products | Books to Scrape - Sandbox\n" -} \ No newline at end of file diff --git a/html_tag_collector/urls.json b/html_tag_collector/urls.json deleted file mode 100644 index 79574f93..00000000 --- a/html_tag_collector/urls.json +++ /dev/null @@ -1,17 +0,0 @@ -[{ - "id": 1, - "url": "https://pdap.io", - "label": "Label" -}, { - "id": 2, - "url": "https://pdapio.io", - "label": "Label" -}, { - "id": 3, - "url": "https://pdap.dev", - "label": "Label" -}, { - "id": 4, - "url": "https://pdap.io/404test", - "label": "Label" -}] From 4c9f03f8d9e91d1fd955922d9150d57b36643bf5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 27 Jan 2025 09:37:21 -0500 Subject: [PATCH 011/182] Begin draft --- core/enums.py | 40 +++++++++- hugging_face/HuggingFaceInterface.py | 6 +- hugging_face/URLClassifier.py | 10 +++ llm_api_logic/DeepSeekRecordClassifier.py | 92 +++++++++++++++++++++++ llm_api_logic/__init__.py | 0 requirements.txt | 2 +- 6 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 hugging_face/URLClassifier.py create mode 100644 llm_api_logic/DeepSeekRecordClassifier.py create mode 100644 llm_api_logic/__init__.py diff --git a/core/enums.py b/core/enums.py index 69505406..605e49e5 100644 --- a/core/enums.py +++ b/core/enums.py @@ -9,4 +9,42 @@ class BatchStatus(Enum): class LabelStudioTaskStatus(Enum): PENDING = "pending" - COMPLETED = "completed" \ No newline at end of file + COMPLETED = "completed" + +class RecordType(Enum): + ACCIDENT_REPORTS = "Accident Reports" + ARREST_RECORDS = "Arrest Records" + CALLS_FOR_SERVICE = "Calls for Service" + CAR_GPS = "Car GPS" + CITATIONS = "Citations" + DISPATCH_LOGS = "Dispatch Logs" + DISPATCH_RECORDINGS = "Dispatch Recordings" + FIELD_CONTACTS = "Field Contacts" + INCIDENT_REPORTS = "Incident Reports" + MISC_POLICE_ACTIVITY = "Misc Police Activity" + OFFICER_INVOLVED_SHOOTINGS = "Officer Involved Shootings" + STOPS = "Stops" + SURVEYS = "Surveys" + USE_OF_FORCE_REPORTS = "Use of Force Reports" + VEHICLE_PURSUITS = "Vehicle Pursuits" + COMPLAINTS_AND_MISCONDUCT = "Complaints & Misconduct" + DAILY_ACTIVITY_LOGS = "Daily Activity Logs" + TRAINING_AND_HIRING_INFO = "Training & Hiring Info" + PERSONNEL_RECORDS = "Personnel Records" + ANNUAL_AND_MONTHLY_REPORTS = "Annual & Monthly Reports" + BUDGETS_AND_FINANCES = "Budgets & Finances" + CONTACT_INFO_AND_AGENCY_META = "Contact Info & Agency Meta" + GEOGRAPHIC = "Geographic" + LIST_OF_DATA_SOURCES = "List of Data Sources" + POLICIES_AND_CONTRACTS = "Policies & Contracts" + CRIME_MAPS_AND_REPORTS = "Crime Maps & Reports" + CRIME_STATISTICS = "Crime Statistics" + MEDIA_BULLETINS = "Media Bulletins" + RECORDS_REQUEST_INFO = "Records Request Info" + RESOURCES = "Resources" + SEX_OFFENDER_REGISTRY = "Sex Offender Registry" + WANTED_PERSONS = "Wanted Persons" + BOOKING_REPORTS = "Booking Reports" + COURT_CASES = "Court Cases" + INCARCERATION_RECORDS = "Incarceration Records" + OTHER = "Other" diff --git a/hugging_face/HuggingFaceInterface.py b/hugging_face/HuggingFaceInterface.py index efb54b75..2ea635d5 100644 --- a/hugging_face/HuggingFaceInterface.py +++ b/hugging_face/HuggingFaceInterface.py @@ -1,12 +1,14 @@ from transformers import pipeline from collector_db.DTOs.URLWithHTML import URLWithHTML +from hugging_face.URLClassifier import URLClassifier class HuggingFaceInterface: def __init__(self): - self.pipe = pipeline("text-classification", model="PDAP/url-relevance") + self.relevance_pipe = pipeline("text-classification", model="PDAP/url-relevance") + 
self.url_classifier = URLClassifier() def get_url_relevancy( self, @@ -14,7 +16,7 @@ def get_url_relevancy( threshold: float = 0.5 ) -> list[bool]: urls = [url_with_html.url for url_with_html in urls_with_html] - results: list[dict] = self.pipe(urls) + results: list[dict] = self.relevance_pipe(urls) bool_results = [] for result in results: diff --git a/hugging_face/URLClassifier.py b/hugging_face/URLClassifier.py new file mode 100644 index 00000000..04380645 --- /dev/null +++ b/hugging_face/URLClassifier.py @@ -0,0 +1,10 @@ +from multimodal_transformers.model import DistilBertWithTabular +from transformers import AutoTokenizer + + +class URLClassifier: + + def __init__(self): + self.tokenizer = AutoTokenizer.from_pretrained("PDAP/url-classifier") + self.model = DistilBertWithTabular.from_pretrained("PDAP/url-classifier") + self.model.eval() diff --git a/llm_api_logic/DeepSeekRecordClassifier.py b/llm_api_logic/DeepSeekRecordClassifier.py new file mode 100644 index 00000000..dde870b6 --- /dev/null +++ b/llm_api_logic/DeepSeekRecordClassifier.py @@ -0,0 +1,92 @@ +import os + +from openai import OpenAI + +from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo +from collector_db.DTOs.URLWithHTML import URLWithHTML +from core.enums import RecordType + +QUERY_CONTENT = """ + You will be provided with structured data from a web page and determine + the record type. + + The record types are as follows + + "Accident Reports": Records of vehicle accidents. + "Arrest Records": Records of each arrest made in the agency's jurisdiction. + "Calls for Service": Records of officers initiating activity or responding to requests for police response. Often called "Dispatch Logs" or "Incident Reports" when published. + "Car GPS": Records of police car location. Not generally posted online. + "Citations": Records of low-level criminal offenses where a police officer issued a citation instead of an arrest. + "Dispatch Logs": Records of calls or orders made by police dispatchers. + "Dispatch Recordings": Audio feeds and/or archives of municipal dispatch channels. + "Field Contacts": Reports of contact between police and civilians. May include uses of force, incidents, arrests, or contacts where nothing notable happened. + "Incident Reports": Reports made by police officers after responding to a call which may or may not be criminal in nature. Not generally posted online. + "Misc Police Activity": Records or descriptions of police activity not covered by other record types. + "Officer Involved Shootings": Case files of gun violence where a police officer was involved, typically as the shooter. Detailed, often containing references to records like Media Bulletins and Use of Force Reports. + "Stops": Records of pedestrian or traffic stops made by police. + "Surveys": Information captured from a sample of some population, like incarcerated people or magistrate judges. Often generated independently. + "Use of Force Reports": Records of use of force against civilians by police officers. + "Vehicle Pursuits": Records of cases where police pursued a person fleeing in a vehicle. + "Complaints & Misconduct": Records, statistics, or summaries of complaints and misconduct investigations into law enforcement officers. + "Daily Activity Logs": Officer-created reports or time sheets of what happened on a shift. Not generally posted online. + "Training & Hiring Info": Records and descriptions of additional training for police officers. 
+ "Personnel Records": Records of hiring and firing, certification, discipline, and other officer-specific events. Not generally posted online. + "Annual & Monthly Reports": Often in PDF form, featuring summaries or high-level updates about the police force. Can contain versions of other record types, especially summaries. + "Budgets & Finances": Budgets, finances, grants, or other financial documents. + "Contact Info & Agency Meta": Information about organizational structure, including department structure and contact info. + "Geographic": Maps or geographic data about how land is divided up into municipal sectors, zones, and jurisdictions. + "List of Data Sources": Places on the internet, often data portal homepages, where many links to potential data sources can be found. + "Policies & Contracts": Policies or contracts related to agency procedure. + "Crime Maps & Reports": Records of individual crimes in map or table form for a given jurisdiction. + "Crime Statistics": Summarized information about crime in a given jurisdiction. + "Media Bulletins": Press releases, blotters, or blogs intended to broadly communicate alerts, requests, or other timely information. + "Records Request Info": Portals, forms, policies, or other resources for making public records requests. + "Resources": Agency-provided information or guidance about services, prices, best practices, etc. + "Sex Offender Registry": Index of people registered, usually by law, with the government as sex offenders. + "Wanted Persons": Names, descriptions, images, and associated information about people with outstanding arrest warrants. + "Booking Reports": Records of booking or intake into corrections institutions. + "Court Cases": Records such as dockets about individual court cases. + "Incarceration Records": Records of current inmates, often with full names and features for notification upon inmate release. + "Other": Other record types not otherwise described. + + Output the record type in the following format. 
Do not include any other information: + + { + "record_type": "" + } + """ + +def dictify_html_info(html_info: URLHTMLContentInfo) -> dict: + + + +class DeepSeekRecordClassifier: + + def __init__(self): + self.client = OpenAI( + api_key=os.getenv("DEEPSEEK_API_KEY"), + base_url="https://api.deepseek.com" + ) + + def build_query_messages(self, html_info: URLHTMLContentInfo) -> list[dict[str, str]]: + insert_content = dictify_html_info(html_info) + return [ + { + "role": "system", + "content": QUERY_CONTENT + }, + { + "role": "user", + "content": f"```json{insert_content}```" + } + ] + + def classify_url(self, url_with_html: URLWithHTML) -> RecordType: + response = self.client.chat.completions.create( + model="deepseek-chat", + messages=self.build_query_messages(url_with_html.html_infos[0]), + stream=False, + response_format={ + 'type': 'json_object' + } + ) diff --git a/llm_api_logic/__init__.py b/llm_api_logic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/requirements.txt b/requirements.txt index c05cfbfa..2cc28614 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,4 +45,4 @@ PyJWT~=2.10.1 # Tests pytest-timeout~=2.3.1 - +openai~=1.60.1 From 5cca756d48e256bb147f830ef7aada1cd02333d0 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 27 Jan 2025 11:07:20 -0500 Subject: [PATCH 012/182] Create first draft of DeepSeek record classifier and test --- llm_api_logic/DeepSeekRecordClassifier.py | 24 ++++++++------- tests/manual/llm_api_logic/__init__.py | 0 .../test_deepseek_record_classifier.py | 29 +++++++++++++++++++ 3 files changed, 42 insertions(+), 11 deletions(-) create mode 100644 tests/manual/llm_api_logic/__init__.py create mode 100644 tests/manual/llm_api_logic/test_deepseek_record_classifier.py diff --git a/llm_api_logic/DeepSeekRecordClassifier.py b/llm_api_logic/DeepSeekRecordClassifier.py index dde870b6..1348b358 100644 --- a/llm_api_logic/DeepSeekRecordClassifier.py +++ b/llm_api_logic/DeepSeekRecordClassifier.py @@ -1,9 +1,8 @@ import os -from openai import OpenAI +from openai import AsyncOpenAI from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo -from collector_db.DTOs.URLWithHTML import URLWithHTML from core.enums import RecordType QUERY_CONTENT = """ @@ -56,20 +55,22 @@ } """ -def dictify_html_info(html_info: URLHTMLContentInfo) -> dict: - - +def dictify_html_info(html_infos: list[URLHTMLContentInfo]) -> dict[str, str]: + d = {} + for html_info in html_infos: + d[html_info.content_type.value] = html_info.content + return d class DeepSeekRecordClassifier: def __init__(self): - self.client = OpenAI( + self.client = AsyncOpenAI( api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com" ) - def build_query_messages(self, html_info: URLHTMLContentInfo) -> list[dict[str, str]]: - insert_content = dictify_html_info(html_info) + def build_query_messages(self, content_infos: list[URLHTMLContentInfo]) -> list[dict[str, str]]: + insert_content = dictify_html_info(content_infos) return [ { "role": "system", @@ -81,12 +82,13 @@ def build_query_messages(self, html_info: URLHTMLContentInfo) -> list[dict[str, } ] - def classify_url(self, url_with_html: URLWithHTML) -> RecordType: - response = self.client.chat.completions.create( + async def classify_url(self, content_infos: list[URLHTMLContentInfo]) -> RecordType: + response = await self.client.chat.completions.create( model="deepseek-chat", - messages=self.build_query_messages(url_with_html.html_infos[0]), + messages=self.build_query_messages(content_infos), stream=False, 
response_format={
                 'type': 'json_object'
             }
         )
+        # Parse the JSON payload out of the SDK response object: the openai 1.x
+        # client returns typed objects rather than dicts, and message.content is
+        # a JSON string when response_format is json_object.
+        import json
+        return RecordType(json.loads(response.choices[0].message.content)["record_type"])
diff --git a/tests/manual/llm_api_logic/__init__.py b/tests/manual/llm_api_logic/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py
new file mode 100644
index 00000000..48692023
--- /dev/null
+++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py
@@ -0,0 +1,29 @@
+import pytest
+
+from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo
+from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier
+
+
+@pytest.mark.asyncio
+async def test_deepseek_record_classifier():
+    from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType as hct
+
+    d = {
+        hct.TITLE: "test title",
+        hct.DESCRIPTION: "test description",
+        hct.H1: "test h1",
+        hct.H2: "test h2",
+        hct.H3: "test h3",
+        hct.H4: "test h4",
+        hct.H5: "test h5",
+        hct.H6: "test h6",
+        hct.DIV: "test div",
+    }
+    content_infos = []
+    for k, v in d.items():
+        content_info = URLHTMLContentInfo(content_type=k, content=v)
+        content_infos.append(content_info)
+
+    classifier = DeepSeekRecordClassifier()
+    result = await classifier.classify_url(content_infos)
+    print(result)
\ No newline at end of file
From e1dce2f292b9304d9f860e9562e2d6a84576aec1 Mon Sep 17 00:00:00 2001
From: maxachis <maxachis@gmail.com>
Date: Mon, 27 Jan 2025 12:23:56 -0500
Subject: [PATCH 013/182] Change "Cycle" term to "Task" for clarity. Add README
 description.
---
 collector_db/AsyncDatabaseClient.py           | 48 ++++++++++
 .../RelevanceLabelStudioInputCycleInfo.py     |  9 --
 collector_db/enums.py                         |  5 +
 collector_db/models.py                        |  1 +
 core/AsyncCore.py                             | 26 ++---
 ...URLHTMLCycleInfo.py => URLHTMLTaskInfo.py} |  2 +-
 core/DTOs/URLRecordTypeTaskInfo.py            | 11 +++
 ....py => URLRelevanceHuggingfaceTaskInfo.py} |  2 +-
 core/README.md                                | 11 ++-
 core/ScheduledTaskManager.py                  |  2 +-
 core/classes/URLHTMLCycler.py                 | 95 -------------------
 core/classes/URLHTMLTaskOperator.py           | 95 +++++++++++++++++++
 core/classes/URLRecordTypeTaskOperator.py     | 24 +++++
 ...=> URLRelevanceHuggingfaceTaskOperator.py} | 47 +++++----
 .../test_html_tag_collector_integration.py    | 10 +-
 .../test_url_relevancy_huggingface_cycle.py   |  6 +-
 16 files changed, 245 insertions(+), 149 deletions(-)
 delete mode 100644 collector_db/DTOs/RelevanceLabelStudioInputCycleInfo.py
 rename core/DTOs/{URLHTMLCycleInfo.py => URLHTMLTaskInfo.py} (94%)
 create mode 100644 core/DTOs/URLRecordTypeTaskInfo.py
 rename core/DTOs/{URLRelevanceHuggingfaceCycleInfo.py => URLRelevanceHuggingfaceTaskInfo.py} (78%)
 delete mode 100644 core/classes/URLHTMLCycler.py
 create mode 100644 core/classes/URLHTMLTaskOperator.py
 create mode 100644 core/classes/URLRecordTypeTaskOperator.py
 rename core/classes/{URLRelevanceHuggingfaceCycler.py => URLRelevanceHuggingfaceTaskOperator.py} (51%)

diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index db94a8d5..e83d10ca 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -119,11 +119,59 @@ async def get_pending_urls_without_html_data(self, session: AsyncSession):
         scalar_result = await session.scalars(statement)
         return scalar_result.all()
 
+    @session_manager
+    async def get_urls_with_html_data_and_without_metadata_type(
+            self,
+            session: AsyncSession,
+            without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT
+    ):
+
+        # TODO: Generalize this so 
that it can exclude based on other attributes + # Get URLs with no relevancy metadata + statement = (select(URL.id, URL.url, URLHTMLContent). + join(URLHTMLContent). + where(URL.outcome == URLStatus.PENDING.value) + # No relevancy metadata + .where( + ~exists( + select(URLMetadata.id). + where( + URLMetadata.url_id == URL.id, + URLMetadata.attribute == without_metadata_type.value + ) + ) + ) + .limit(100) + .order_by(URL.id) + ) + raw_result = await session.execute(statement) + result = raw_result.all() + url_ids_to_urls = {url_id: url for url_id, url, _ in result} + url_ids_to_html_info = {url_id: [] for url_id, _, _ in result} + + for url_id, _, html_info in result: + url_ids_to_html_info[url_id].append( + URLHTMLContentInfo(**html_info.__dict__) + ) + + final_results = [] + for url_id, url in url_ids_to_urls.items(): + url_with_html = URLWithHTML( + url_id=url_id, + url=url, + html_infos=url_ids_to_html_info[url_id] + ) + final_results.append(url_with_html) + + + return final_results + @session_manager async def get_urls_with_html_data_and_no_relevancy_metadata( self, session: AsyncSession ) -> list[URLWithHTML]: + # TODO: Generalize this so that it can exclude based on other attributes # Get URLs with no relevancy metadata statement = (select(URL.id, URL.url, URLHTMLContent). join(URLHTMLContent). diff --git a/collector_db/DTOs/RelevanceLabelStudioInputCycleInfo.py b/collector_db/DTOs/RelevanceLabelStudioInputCycleInfo.py deleted file mode 100644 index 644e0e27..00000000 --- a/collector_db/DTOs/RelevanceLabelStudioInputCycleInfo.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel - -from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo - - -class RelevanceLabelStudioInputCycleInfo(BaseModel): - url: str - metadata_id: int - html_content_info: list[URLHTMLContentInfo] \ No newline at end of file diff --git a/collector_db/enums.py b/collector_db/enums.py index fa66aac4..66734a9c 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -32,6 +32,11 @@ class URLHTMLContentType(PyEnum): H6 = "H6" DIV = "Div" +class TaskType(PyEnum): + HTML = "HTML" + RELEVANCY = "Relevancy" + RECORD_TYPE = "Record Type" + class PGEnum(TypeDecorator): impl = postgresql.ENUM diff --git a/collector_db/models.py b/collector_db/models.py index 273d956f..03837a0b 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -158,6 +158,7 @@ class URLErrorInfo(Base): url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) error = Column(Text, nullable=False) updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + # TODO: Add Info on Cycle the task occurred in # Relationships url = relationship("URL", back_populates="error_info") diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 67f134b1..de70abd2 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -6,8 +6,8 @@ from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo from core.DTOs.RelevanceAnnotationRequestInfo import RelevanceAnnotationRequestInfo -from core.classes.URLHTMLCycler import URLHTMLCycler -from core.classes.URLRelevanceHuggingfaceCycler import URLRelevanceHuggingfaceCycler +from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator from html_tag_collector.DataClassTags import convert_to_response_html_info from html_tag_collector.ResponseParser import HTMLResponseParser from 
html_tag_collector.URLRequestInterface import URLRequestInterface @@ -30,26 +30,26 @@ def __init__( self.logger = logging.getLogger(__name__) self.logger.setLevel(logging.INFO) - async def run_url_html_cycle(self): - self.logger.info("Running URL HTML Cycle") - cycler = URLHTMLCycler( + async def run_url_html_task(self): + self.logger.info("Running URL HTML Task") + operator = URLHTMLTaskOperator( adb_client=self.adb_client, url_request_interface=self.url_request_interface, html_parser=self.html_parser ) - await cycler.cycle() + await operator.run_task() - async def run_url_relevance_huggingface_cycle(self): - self.logger.info("Running URL Relevance Huggingface Cycle") - cycler = URLRelevanceHuggingfaceCycler( + async def run_url_relevance_huggingface_task(self): + self.logger.info("Running URL Relevance Huggingface Task") + operator = URLRelevanceHuggingfaceTaskOperator( adb_client=self.adb_client, huggingface_interface=self.huggingface_interface ) - await cycler.cycle() + await operator.run_task() - async def run_cycles(self): - await self.run_url_html_cycle() - await self.run_url_relevance_huggingface_cycle() + async def run_tasks(self): + await self.run_url_html_task() + await self.run_url_relevance_huggingface_task() async def convert_to_relevance_annotation_request_info(self, url_info: URLAnnotationInfo) -> RelevanceAnnotationRequestInfo: response_html_info = convert_to_response_html_info( diff --git a/core/DTOs/URLHTMLCycleInfo.py b/core/DTOs/URLHTMLTaskInfo.py similarity index 94% rename from core/DTOs/URLHTMLCycleInfo.py rename to core/DTOs/URLHTMLTaskInfo.py index 1d739375..cff69e4f 100644 --- a/core/DTOs/URLHTMLCycleInfo.py +++ b/core/DTOs/URLHTMLTaskInfo.py @@ -7,7 +7,7 @@ @dataclass -class URLHTMLCycleInfo: +class URLHTMLTaskInfo: url_info: URLInfo url_response_info: Optional[URLResponseInfo] = None html_tag_info: Optional[ResponseHTMLInfo] = None diff --git a/core/DTOs/URLRecordTypeTaskInfo.py b/core/DTOs/URLRecordTypeTaskInfo.py new file mode 100644 index 00000000..6c5d8ea7 --- /dev/null +++ b/core/DTOs/URLRecordTypeTaskInfo.py @@ -0,0 +1,11 @@ +from typing import Optional + +from pydantic import BaseModel + +from collector_db.DTOs.URLWithHTML import URLWithHTML +from core.enums import RecordType + + +class URLRecordTypeTaskInfo(BaseModel): + url_with_html: URLWithHTML + record_type: Optional[RecordType] = None \ No newline at end of file diff --git a/core/DTOs/URLRelevanceHuggingfaceCycleInfo.py b/core/DTOs/URLRelevanceHuggingfaceTaskInfo.py similarity index 78% rename from core/DTOs/URLRelevanceHuggingfaceCycleInfo.py rename to core/DTOs/URLRelevanceHuggingfaceTaskInfo.py index 19318e6a..bb4553d1 100644 --- a/core/DTOs/URLRelevanceHuggingfaceCycleInfo.py +++ b/core/DTOs/URLRelevanceHuggingfaceTaskInfo.py @@ -5,6 +5,6 @@ from collector_db.DTOs.URLWithHTML import URLWithHTML -class URLRelevanceHuggingfaceCycleInfo(BaseModel): +class URLRelevanceHuggingfaceTaskInfo(BaseModel): url_with_html: URLWithHTML relevant: Optional[bool] = None diff --git a/core/README.md b/core/README.md index c9095c41..25b1cde3 100644 --- a/core/README.md +++ b/core/README.md @@ -2,4 +2,13 @@ The Source Collector Core is a directory which integrates: 1. The Collector Manager 2. The Source Collector Database 3. The API (to be developed) -4. The PDAP API Client (to be developed) \ No newline at end of file +4. The PDAP API Client (to be developed) + +# Nomenclature + +- **Collector**: A submodule for collecting URLs. 
Different collectors utilize different sources and different methods for gathering URLs.
- **Batch**: URLs are collected in Collector Batches, with different collectors producing different Batches.
- **Cycle**: Refers to the overall lifecycle for each URL -- from initial retrieval in a Batch to either disposal or incorporation into the Data Sources App Database.
- **Task**: A semi-independent operation performed on a set of URLs. These include: collection, retrieving HTML data, getting metadata via Machine Learning, and so on.
- **Task Set**: Refers to a group of URLs that are operated on together as part of a single task. The URLs in a set are not necessarily all from the same batch. URLs in a task set should only be operated on in that task once.
- **Task Operator**: A class which performs a single task on a set of URLs.
\ No newline at end of file
diff --git a/core/ScheduledTaskManager.py b/core/ScheduledTaskManager.py
index 590690d1..e061adee 100644
--- a/core/ScheduledTaskManager.py
+++ b/core/ScheduledTaskManager.py
@@ -52,7 +52,7 @@ def __init__(self, async_core: AsyncCore):
     def add_scheduled_tasks(self):
         self.run_cycles_job = self.scheduler.add_job(
-            self.async_core.run_cycles,
+            self.async_core.run_tasks,
             trigger=IntervalTrigger(
                 hours=1,
                 start_date=datetime.now() + timedelta(minutes=1)
diff --git a/core/classes/URLHTMLCycler.py b/core/classes/URLHTMLCycler.py
deleted file mode 100644
index 73344a9c..00000000
--- a/core/classes/URLHTMLCycler.py
+++ /dev/null
@@ -1,95 +0,0 @@
-from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
-from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
-from collector_db.DTOs.URLInfo import URLInfo
-from core.DTOs.URLHTMLCycleInfo import URLHTMLCycleInfo
-from core.classes.HTMLContentInfoGetter import HTMLContentInfoGetter
-from html_tag_collector.ResponseParser import HTMLResponseParser
-from html_tag_collector.URLRequestInterface import URLRequestInterface
-
-
-class URLHTMLCycler:
-
-    def __init__(
-            self,
-            url_request_interface: URLRequestInterface,
-            adb_client: AsyncDatabaseClient,
-            html_parser: HTMLResponseParser
-    ):
-        self.url_request_interface = url_request_interface
-        self.adb_client = adb_client
-        self.html_parser = html_parser
-
-    async def cycle(self):
-        print("Running URL HTML Cycle...")
-        cycle_infos = await self.get_pending_urls_without_html_data()
-        await self.get_raw_html_data_for_urls(cycle_infos)
-        success_cycles, error_cycles = await self.separate_success_and_error_cycles(cycle_infos)
-        await self.update_errors_in_database(error_cycles)
-        await self.process_html_data(success_cycles)
-        await self.update_html_data_in_database(success_cycles)
-
-
-    async def get_just_urls(self, cycle_infos: list[URLHTMLCycleInfo]):
-        return [cycle_info.url_info.url for cycle_info in cycle_infos]
-
-    async def get_pending_urls_without_html_data(self):
-        pending_urls: list[URLInfo] = await self.adb_client.get_pending_urls_without_html_data()
-        cycle_infos = [
-            URLHTMLCycleInfo(
-                url_info=url_info,
-            ) for url_info in pending_urls
-        ]
-        return cycle_infos
-
-    async def get_raw_html_data_for_urls(self, cycle_infos: list[URLHTMLCycleInfo]):
-        just_urls = await self.get_just_urls(cycle_infos)
-        url_response_infos = await self.url_request_interface.make_requests(just_urls)
-        for cycle_info, url_response_info in zip(cycle_infos, url_response_infos):
-            cycle_info.url_response_info = url_response_info
-
-    async def separate_success_and_error_cycles(
-            self,
-            cycle_infos: list[URLHTMLCycleInfo]
-    ) -> tuple[
-        
list[URLHTMLCycleInfo], # Successful - list[URLHTMLCycleInfo] # Error - ]: - errored_cycle_infos = [] - successful_cycle_infos = [] - for cycle_info in cycle_infos: - if not cycle_info.url_response_info.success: - errored_cycle_infos.append(cycle_info) - else: - successful_cycle_infos.append(cycle_info) - return successful_cycle_infos, errored_cycle_infos - - async def update_errors_in_database(self, errored_cycle_infos: list[URLHTMLCycleInfo]): - error_infos = [] - for errored_cycle_info in errored_cycle_infos: - error_info = URLErrorPydanticInfo( - url_id=errored_cycle_info.url_info.id, - error=str(errored_cycle_info.url_response_info.exception), - ) - error_infos.append(error_info) - await self.adb_client.add_url_error_infos(error_infos) - - async def process_html_data(self, cycle_infos: list[URLHTMLCycleInfo]): - for cycle_info in cycle_infos: - html_tag_info = await self.html_parser.parse( - url=cycle_info.url_info.url, - html_content=cycle_info.url_response_info.html, - content_type=cycle_info.url_response_info.content_type - ) - cycle_info.html_tag_info = html_tag_info - - async def update_html_data_in_database(self, cycle_infos: list[URLHTMLCycleInfo]): - html_content_infos = [] - for cycle_info in cycle_infos: - hcig = HTMLContentInfoGetter( - response_html_info=cycle_info.html_tag_info, - url_id=cycle_info.url_info.id - ) - results = hcig.get_all_html_content() - html_content_infos.extend(results) - - await self.adb_client.add_html_content_infos(html_content_infos) diff --git a/core/classes/URLHTMLTaskOperator.py b/core/classes/URLHTMLTaskOperator.py new file mode 100644 index 00000000..42c3e21a --- /dev/null +++ b/core/classes/URLHTMLTaskOperator.py @@ -0,0 +1,95 @@ +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo +from collector_db.DTOs.URLInfo import URLInfo +from core.DTOs.URLHTMLTaskInfo import URLHTMLTaskInfo +from core.classes.HTMLContentInfoGetter import HTMLContentInfoGetter +from html_tag_collector.ResponseParser import HTMLResponseParser +from html_tag_collector.URLRequestInterface import URLRequestInterface + + +class URLHTMLTaskOperator: + + def __init__( + self, + url_request_interface: URLRequestInterface, + adb_client: AsyncDatabaseClient, + html_parser: HTMLResponseParser + ): + self.url_request_interface = url_request_interface + self.adb_client = adb_client + self.html_parser = html_parser + + async def run_task(self): + print("Running URL HTML Task...") + task_infos = await self.get_pending_urls_without_html_data() + await self.get_raw_html_data_for_urls(task_infos) + success_subset, error_subset = await self.separate_success_and_error_subsets(task_infos) + await self.update_errors_in_database(error_subset) + await self.process_html_data(success_subset) + await self.update_html_data_in_database(success_subset) + + + async def get_just_urls(self, task_infos: list[URLHTMLTaskInfo]): + return [task_info.url_info.url for task_info in task_infos] + + async def get_pending_urls_without_html_data(self): + pending_urls: list[URLInfo] = await self.adb_client.get_pending_urls_without_html_data() + task_infos = [ + URLHTMLTaskInfo( + url_info=url_info, + ) for url_info in pending_urls + ] + return task_infos + + async def get_raw_html_data_for_urls(self, task_infos: list[URLHTMLTaskInfo]): + just_urls = await self.get_just_urls(task_infos) + url_response_infos = await self.url_request_interface.make_requests(just_urls) + for task_info, url_response_info in zip(task_infos, 
url_response_infos):
+            task_info.url_response_info = url_response_info
+
+    async def separate_success_and_error_subsets(
+            self,
+            task_infos: list[URLHTMLTaskInfo]
+    ) -> tuple[
+        list[URLHTMLTaskInfo], # Successful
+        list[URLHTMLTaskInfo] # Error
+    ]:
+        errored_task_infos = []
+        successful_task_infos = []
+        for task_info in task_infos:
+            if not task_info.url_response_info.success:
+                errored_task_infos.append(task_info)
+            else:
+                successful_task_infos.append(task_info)
+        return successful_task_infos, errored_task_infos
+
+    async def update_errors_in_database(self, errored_task_infos: list[URLHTMLTaskInfo]):
+        error_infos = []
+        for error_task_info in errored_task_infos:
+            error_info = URLErrorPydanticInfo(
+                url_id=error_task_info.url_info.id,
+                error=str(error_task_info.url_response_info.exception),
+            )
+            error_infos.append(error_info)
+        await self.adb_client.add_url_error_infos(error_infos)
+
+    async def process_html_data(self, task_infos: list[URLHTMLTaskInfo]):
+        for task_info in task_infos:
+            html_tag_info = await self.html_parser.parse(
+                url=task_info.url_info.url,
+                html_content=task_info.url_response_info.html,
+                content_type=task_info.url_response_info.content_type
+            )
+            task_info.html_tag_info = html_tag_info
+
+    async def update_html_data_in_database(self, task_infos: list[URLHTMLTaskInfo]):
+        html_content_infos = []
+        for task_info in task_infos:
+            hcig = HTMLContentInfoGetter(
+                response_html_info=task_info.html_tag_info,
+                url_id=task_info.url_info.id
+            )
+            results = hcig.get_all_html_content()
+            html_content_infos.extend(results)
+
+        await self.adb_client.add_html_content_infos(html_content_infos)
diff --git a/core/classes/URLRecordTypeTaskOperator.py b/core/classes/URLRecordTypeTaskOperator.py
new file mode 100644
index 00000000..3b1b27b2
--- /dev/null
+++ b/core/classes/URLRecordTypeTaskOperator.py
@@ -0,0 +1,26 @@
+from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
+from collector_db.enums import URLMetadataAttributeType
+from core.DTOs.URLRecordTypeTaskInfo import URLRecordTypeTaskInfo
+from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier
+
+
+class URLRecordTypeTaskOperator:
+
+    def __init__(
+            self,
+            adb_client: AsyncDatabaseClient,
+            classifier: DeepSeekRecordClassifier
+    ):
+        self.adb_client = adb_client
+        self.classifier = classifier
+
+    async def run_task(self):
+        # Get pending urls from Source Collector
+        # with HTML data and without Record Type Metadata
+        task_infos = await self.adb_client.get_pending_urls_without_html_data(
+            without_metadata_attribute=URLMetadataAttributeType.RECORD_TYPE
+        )
+
+
+    async def get_ml_classifications(self, task_infos: list[URLRecordTypeTaskInfo]):
+        raise NotImplementedError  # Draft stub: not yet implemented
\ No newline at end of file
diff --git a/core/classes/URLRelevanceHuggingfaceCycler.py b/core/classes/URLRelevanceHuggingfaceTaskOperator.py
similarity index 51%
rename from core/classes/URLRelevanceHuggingfaceCycler.py
rename to core/classes/URLRelevanceHuggingfaceTaskOperator.py
index 8ffdb705..904adbe1 100644
--- a/core/classes/URLRelevanceHuggingfaceCycler.py
+++ b/core/classes/URLRelevanceHuggingfaceTaskOperator.py
@@ -2,11 +2,11 @@
 from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo
 from collector_db.DTOs.URLWithHTML import URLWithHTML
 from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource
-from core.DTOs.URLRelevanceHuggingfaceCycleInfo import URLRelevanceHuggingfaceCycleInfo
+from core.DTOs.URLRelevanceHuggingfaceTaskInfo import URLRelevanceHuggingfaceTaskInfo
 from hugging_face.HuggingFaceInterface import HuggingFaceInterface
 
 
-class
URLRelevanceHuggingfaceCycler: +class URLRelevanceHuggingfaceTaskOperator: def __init__( self, @@ -16,41 +16,48 @@ def __init__( self.adb_client = adb_client self.huggingface_interface = huggingface_interface - async def cycle(self): + async def run_task(self): # Get pending urls from Source Collector # with HTML data and without Relevancy Metadata - cycle_infos = await self.get_pending_url_info() + task_infos = await self.get_pending_url_info( + without_metadata_attribute=URLMetadataAttributeType.RELEVANT + ) # Pipe into Huggingface - await self.add_huggingface_relevancy(cycle_infos) + await self.add_huggingface_relevancy(task_infos) # Put results into Database - await self.put_results_into_database(cycle_infos) + await self.put_results_into_database(task_infos) - async def put_results_into_database(self, cycle_infos): + async def put_results_into_database(self, task_infos): url_metadatas = [] - for cycle_info in cycle_infos: + for task_info in task_infos: url_metadata = URLMetadataInfo( - url_id=cycle_info.url_with_html.url_id, + url_id=task_info.url_with_html.url_id, attribute=URLMetadataAttributeType.RELEVANT, - value=str(cycle_info.relevant), + value=str(task_info.relevant), validation_status=ValidationStatus.PENDING_VALIDATION, validation_source=ValidationSource.MACHINE_LEARNING ) url_metadatas.append(url_metadata) await self.adb_client.add_url_metadatas(url_metadatas) - async def add_huggingface_relevancy(self, cycle_infos: list[URLRelevanceHuggingfaceCycleInfo]): - urls_with_html = [cycle_info.url_with_html for cycle_info in cycle_infos] + async def add_huggingface_relevancy(self, task_infos: list[URLRelevanceHuggingfaceTaskInfo]): + urls_with_html = [task_info.url_with_html for task_info in task_infos] results = self.huggingface_interface.get_url_relevancy(urls_with_html) - for cycle_info, result in zip(cycle_infos, results): - cycle_info.relevant = result + for task_info, result in zip(task_infos, results): + task_info.relevant = result - async def get_pending_url_info(self) -> list[URLRelevanceHuggingfaceCycleInfo]: - cycle_infos = [] - pending_urls: list[URLWithHTML] = await self.adb_client.get_urls_with_html_data_and_no_relevancy_metadata() + async def get_pending_url_info( + self, + without_metadata_attribute: URLMetadataAttributeType + ) -> list[URLRelevanceHuggingfaceTaskInfo]: + task_infos = [] + pending_urls: list[URLWithHTML] = await self.adb_client.get_urls_with_html_data_and_without_metadata_type( + without_metadata_type=without_metadata_attribute + ) for url_with_html in pending_urls: - cycle_info = URLRelevanceHuggingfaceCycleInfo( + task_info = URLRelevanceHuggingfaceTaskInfo( url_with_html=url_with_html ) - cycle_infos.append(cycle_info) - return cycle_infos + task_infos.append(task_info) + return task_infos diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index cb803e96..1673ca42 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -3,7 +3,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo -from core.classes.URLHTMLCycler import URLHTMLCycler +from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from helpers.DBDataCreator import DBDataCreator from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.RootURLCache import RootURLCache @@ -43,14 +43,14 @@ async 
def test_url_html_cycle_live_data( """ Tests the cycle on whatever exists in the DB """ - cycler = URLHTMLCycler( + operator = URLHTMLTaskOperator( adb_client=AsyncDatabaseClient(), url_request_interface=URLRequestInterface(), html_parser=HTMLResponseParser( root_url_cache=RootURLCache() ) ) - await cycler.cycle() + await operator.run_task() @pytest.mark.asyncio async def test_url_html_cycle( @@ -64,11 +64,11 @@ async def test_url_html_cycle( db_client.insert_urls(url_infos=url_infos, batch_id=batch_id) - cycler = URLHTMLCycler( + operator = URLHTMLTaskOperator( adb_client=AsyncDatabaseClient(), url_request_interface=URLRequestInterface(), html_parser=HTMLResponseParser( root_url_cache=RootURLCache() ) ) - await cycler.cycle() \ No newline at end of file + await operator.run_task() \ No newline at end of file diff --git a/tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py b/tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py index 064eff51..3ff2c846 100644 --- a/tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py +++ b/tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py @@ -6,7 +6,7 @@ from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.enums import ValidationStatus, ValidationSource from collector_db.models import URLMetadata -from core.classes.URLRelevanceHuggingfaceCycler import URLRelevanceHuggingfaceCycler +from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator from hugging_face.HuggingFaceInterface import HuggingFaceInterface @@ -38,11 +38,11 @@ def mock_get_url_relevancy( mock_hf_interface = MagicMock(spec=HuggingFaceInterface) mock_hf_interface.get_url_relevancy = mock_get_url_relevancy - cycler = URLRelevanceHuggingfaceCycler( + cycler = URLRelevanceHuggingfaceTaskOperator( adb_client=AsyncDatabaseClient(), huggingface_interface=mock_hf_interface ) - await cycler.cycle() + await cycler.run_task() results = await db_data_creator.adb_client.get_all(URLMetadata) From f861bd6d9d6a0c2a2da0beabacb96daf721a1436 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 27 Jan 2025 12:36:57 -0500 Subject: [PATCH 014/182] Add precise test data --- .../test_deepseek_record_classifier.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py index 48692023..3396018d 100644 --- a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py +++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py @@ -9,15 +9,11 @@ async def test_deepseek_record_classifier(): from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType as hct d = { - hct.TITLE: "test title", - hct.DESCRIPTION: "test description", - hct.H1: "test h1", - hct.H2: "test h2", - hct.H3: "test h3", - hct.H4: "test h4", - hct.H5: "test h5", - hct.H6: "test h6", - hct.DIV: "test div", + hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA", + hct.DESCRIPTION: "At the Thursday, November 2 regular city council meeting, Chief Evans administered the oath of office and swearing in of Corporal Cody Lumpkin. Corporal Lumpkin was surrounded by his family and members of the Acworth Police Department for the occasion. 
Corporal Lumpkin began employment with the Acworth Police Department on June 8,", + hct.H3: ["Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police"], + hct.H4: ["Share this on Social Media"], + hct.DIV: "PHONE DIRECTORY RESOURCES Search for: Search Button NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Administration Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Administration Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police Published On: November 3, 2023 At the Thursday, November 2 regular city council meeting, Chief Evans administered the oath of office and swearing in of Corporal Cody Lumpkin.  Corporal Lumpkin was surrounded by his family and members of the Acworth Police Department for the occasion.  Corporal Lumpkin began employment with the Acworth Police Department on June 8 , 2015, and has served as a patrol officer in addition to time spent time in Special Operations prior to his recent promotion. Share this on Social Media 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2025 City of Acworth Acworth is located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! 
ESS | Webmail | Handbook | Peak | Laserfiche | Login ", } content_infos = [] for k, v in d.items(): From 593b21991a3f99e4355274aafce46ae856f8818d Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 27 Jan 2025 13:59:03 -0500 Subject: [PATCH 015/182] Draft work --- ...c732a_add_task_tables_and_linking_logic.py | 52 +++++++++++++++++ collector_db/models.py | 57 ++++++++++++++++++- tests/test_alembic/helpers.py | 8 ++- tests/test_alembic/test_revisions.py | 14 ++++- .../collector_db/test_database_structure.py | 9 ++- 5 files changed, 134 insertions(+), 6 deletions(-) create mode 100644 collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py diff --git a/collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py b/collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py new file mode 100644 index 00000000..2061ef45 --- /dev/null +++ b/collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py @@ -0,0 +1,52 @@ +"""Add Task Tables and linking logic + +Revision ID: b0e34cec732a +Revises: dae00e5aa8dd +Create Date: 2025-01-27 13:22:49.620212 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'b0e34cec732a' +down_revision: Union[str, None] = 'dae00e5aa8dd' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table('tasks', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('task_type', sa.String(), nullable=False), + sa.Column('task_id', sa.String(), nullable=False), + sa.Column('task_status', sa.String(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('task_type', 'task_id', name='uq_task_type_task_id') + ) + op.create_table('task_errors', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('task_id', sa.Integer(), nullable=False), + sa.Column('error', sa.Text(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=False), + sa.ForeignKeyConstraint(['task_id'], ['tasks.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('link_task_urls', + sa.Column('task_id', sa.Integer(), nullable=False), + sa.Column('url_id', sa.Integer(), nullable=False), + sa.ForeignKeyConstraint(['task_id'], ['tasks.id'], ondelete='CASCADE'), + sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('task_id', 'url_id'), + sa.UniqueConstraint('task_id', 'url_id', name='uq_task_id_url_id') + ) + + +def downgrade() -> None: + op.drop_table('link_task_urls') + op.drop_table('task_errors') + op.drop_table('tasks') diff --git a/collector_db/models.py b/collector_db/models.py index 03837a0b..05578dd2 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -86,6 +86,11 @@ class URL(Base): url_metadata = relationship("URLMetadata", back_populates="url", cascade="all, delete-orphan") html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") error_info = relationship("URLErrorInfo", back_populates="url", cascade="all, delete-orphan") + tasks = relationship( + "Task", + secondary="link_task_urls", + back_populates="url", + ) # URL Metadata table definition @@ -158,10 +163,11 @@ class URLErrorInfo(Base): url_id = 
Column(Integer, ForeignKey('urls.id'), nullable=False) error = Column(Text, nullable=False) updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) - # TODO: Add Info on Cycle the task occurred in + task_id = Column(Integer, ForeignKey('tasks.id'), nullable=True) # Relationships url = relationship("URL", back_populates="error_info") + task = relationship("Task", back_populates="errored_urls") class URLHTMLContent(Base): __tablename__ = 'url_html_content' @@ -232,3 +238,52 @@ class Missing(Base): # Relationships batch = relationship("Batch", back_populates="missings") + +class Task(Base): + __tablename__ = 'tasks' + __table_args__ = (UniqueConstraint( + "task_type", + "task_id", + name="uq_task_type_task_id"), + ) + + id = Column(Integer, primary_key=True) + task_type = Column(String, nullable=False) + task_id = Column(String, nullable=False) + task_status = Column(String, nullable=False) + updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + + # Relationships + urls = relationship( + "URL", + secondary="link_task_urls", + back_populates="task" + ) + task_errors = relationship("TaskError", back_populates="task") + errored_urls = relationship("URLErrorInfo", back_populates="task") + +class LinkTaskURL(Base): + __tablename__ = 'link_task_urls' + __table_args__ = (UniqueConstraint( + "task_id", + "url_id", + name="uq_task_id_url_id"), + ) + + task_id = Column(Integer, ForeignKey('tasks.id', ondelete="CASCADE"), primary_key=True) + url_id = Column(Integer, ForeignKey('urls.id', ondelete="CASCADE"), primary_key=True) + + # Relationships + task = relationship("Task", back_populates="link_task_urls") + url = relationship("URL", back_populates="link_task_urls") + +class TaskError(Base): + __tablename__ = 'task_errors' + + id = Column(Integer, primary_key=True) + task_id = Column(Integer, ForeignKey('tasks.id', ondelete="CASCADE"), nullable=False) + error = Column(Text, nullable=False) + updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + + # Relationships + task = relationship("Task", back_populates="task_errors") \ No newline at end of file diff --git a/tests/test_alembic/helpers.py b/tests/test_alembic/helpers.py index d66854f2..098ee1df 100644 --- a/tests/test_alembic/helpers.py +++ b/tests/test_alembic/helpers.py @@ -9,12 +9,14 @@ def get_enum_values(enum_name: str, session: Session) -> list[str]: def table_creation_check( alembic_runner: AlembicRunner, - table_name: str, + tables: list[str], start_revision: str, end_revision: str ): alembic_runner.upgrade(start_revision) - assert table_name not in alembic_runner.inspector.get_table_names() + for table_name in tables: + assert table_name not in alembic_runner.inspector.get_table_names() alembic_runner.upgrade(end_revision) alembic_runner.reflect() - assert table_name in alembic_runner.inspector.get_table_names() \ No newline at end of file + for table_name in tables: + assert table_name in alembic_runner.inspector.get_table_names() \ No newline at end of file diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py index 75df5f0c..95684ce2 100644 --- a/tests/test_alembic/test_revisions.py +++ b/tests/test_alembic/test_revisions.py @@ -298,7 +298,19 @@ def test_add_in_label_studio_metadata_status(alembic_runner): def test_create_metadata_annotation_table(alembic_runner): table_creation_check( alembic_runner, - "metadata_annotations", + ["metadata_annotations"], start_revision="108dac321086", 
end_revision="dcd158092de0"
+    )
+
+def test_add_task_tables_and_linking_logic(alembic_runner):
+    table_creation_check(
+        alembic_runner,
+        tables=[
+            "tasks",
+            "task_errors",
+            "link_task_urls"
+        ],
+        start_revision="dcd158092de0",
+        end_revision="b0e34cec732a"
    )
\ No newline at end of file
diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py
index 926a6ed8..51d9a918 100644
--- a/tests/test_automated/integration/collector_db/test_database_structure.py
+++ b/tests/test_automated/integration/collector_db/test_database_structure.py
@@ -325,4 +325,12 @@ def test_root_url(db_data_creator: DBDataCreator):
         engine=db_data_creator.db_client.engine
     )
 
-    table_tester.run_column_tests() \ No newline at end of file
+    table_tester.run_column_tests()
+
+def test_task_url_links(db_data_creator: DBDataCreator):
+    # Create URLs
+
+    # Create task
+
+    # Associate URLs with task
+    pass  # Draft placeholder so the module stays importable
From 990cf3bc546999584e0e19af94ded68f9a012969 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 27 Jan 2025 15:38:47 -0500
Subject: [PATCH 016/182] Refine DeepSeekRecordClassifier

* Add relevant environment variable to ENV.md
* Fix bug in manual DeepSeekRecordClassifier test
* Update DeepSeekRecordClassifier.py to pass test
* Refine URLHTMLContentInfo to include list of strings
---
 ENV.md | 1 +
 collector_db/DTOs/URLHTMLContentInfo.py | 2 +-
 llm_api_logic/DeepSeekRecordClassifier.py | 6 +++++-
 .../manual/llm_api_logic/test_deepseek_record_classifier.py | 4 ++--
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/ENV.md b/ENV.md
index a8210fb9..943ad293 100644
--- a/ENV.md
+++ b/ENV.md
@@ -16,3 +16,4 @@ Please ensure these are properly defined in a `.env` file in the root directory.
 |`POSTGRES_PORT` | The port for the test database | `5432` |
 |`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token that is used in the Data Sources App for encoding. |`abc123`|
 |`DEV`| Set to any value to run the application in development mode. |`true`|
+|`DEEPSEEK_API_KEY`| The API key required for accessing the DeepSeek API.
|`abc123`| diff --git a/collector_db/DTOs/URLHTMLContentInfo.py b/collector_db/DTOs/URLHTMLContentInfo.py index ffd82724..f8b24eb0 100644 --- a/collector_db/DTOs/URLHTMLContentInfo.py +++ b/collector_db/DTOs/URLHTMLContentInfo.py @@ -18,4 +18,4 @@ class HTMLContentType(Enum): class URLHTMLContentInfo(BaseModel): url_id: Optional[int] = None content_type: HTMLContentType - content: str \ No newline at end of file + content: str | list[str] \ No newline at end of file diff --git a/llm_api_logic/DeepSeekRecordClassifier.py b/llm_api_logic/DeepSeekRecordClassifier.py index 1348b358..5a2067e0 100644 --- a/llm_api_logic/DeepSeekRecordClassifier.py +++ b/llm_api_logic/DeepSeekRecordClassifier.py @@ -1,3 +1,4 @@ +import json import os from openai import AsyncOpenAI @@ -91,4 +92,7 @@ async def classify_url(self, content_infos: list[URLHTMLContentInfo]) -> RecordT 'type': 'json_object' } ) - return RecordType(response["choices"][0]["message"]["content"]["record_type"]) + result_str = response.choices[0].message.content + + result_dict = json.loads(result_str) + return result_dict["record_type"] diff --git a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py index 3396018d..b0a6c1fb 100644 --- a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py +++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py @@ -16,8 +16,8 @@ async def test_deepseek_record_classifier(): hct.DIV: "PHONE DIRECTORY RESOURCES Search for: Search Button NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Administration Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Administration Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police Published On: November 3, 2023 At the Thursday, November 2 regular city council meeting, Chief Evans administered the oath of office and swearing in of Corporal Cody Lumpkin.  Corporal Lumpkin was surrounded by his family and members of the Acworth Police Department for the occasion.  Corporal Lumpkin began employment with the Acworth Police Department on June 8 , 2015, and has served as a patrol officer in addition to time spent time in Special Operations prior to his recent promotion. 
Share this on Social Media 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2025 City of Acworth Acworth is located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! ESS | Webmail | Handbook | Peak | Laserfiche | Login ", } content_infos = [] - for k, v in d.items(): - content_info = URLHTMLContentInfo(content_type=k, content=v) + for content_type, value in d.items(): + content_info = URLHTMLContentInfo(content_type=content_type, content=value) content_infos.append(content_info) classifier = DeepSeekRecordClassifier() From 6a30a43348f8d8215d3f5ebeb6df2585a6762984 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 28 Jan 2025 18:26:35 -0500 Subject: [PATCH 017/182] Remove unused files --- core/DTOs/LabelStudioExportResponseInfo.py | 9 --------- core/DTOs/LabelStudioTaskInfo.py | 11 ----------- hugging_face/URLClassifier.py | 10 ---------- 3 files changed, 30 deletions(-) delete mode 100644 core/DTOs/LabelStudioExportResponseInfo.py delete mode 100644 core/DTOs/LabelStudioTaskInfo.py delete mode 100644 hugging_face/URLClassifier.py diff --git a/core/DTOs/LabelStudioExportResponseInfo.py b/core/DTOs/LabelStudioExportResponseInfo.py deleted file mode 100644 index fae94096..00000000 --- a/core/DTOs/LabelStudioExportResponseInfo.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Annotated - -from fastapi.param_functions import Doc -from pydantic import BaseModel - - -class LabelStudioExportResponseInfo(BaseModel): - label_studio_import_id: Annotated[int, Doc("The ID of the Label Studio import")] - num_urls_imported: Annotated[int, Doc("The number of URLs imported")] \ No newline at end of file diff --git a/core/DTOs/LabelStudioTaskInfo.py b/core/DTOs/LabelStudioTaskInfo.py deleted file mode 100644 index 5c277c8a..00000000 --- a/core/DTOs/LabelStudioTaskInfo.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from collector_db.enums import URLMetadataAttributeType -from core.enums import LabelStudioTaskStatus - - -class LabelStudioTaskInfo(BaseModel): - metadata_id: int - attribute: URLMetadataAttributeType - task_id: int - task_status: LabelStudioTaskStatus \ No newline at end of file diff --git a/hugging_face/URLClassifier.py b/hugging_face/URLClassifier.py deleted file mode 100644 index 04380645..00000000 --- a/hugging_face/URLClassifier.py +++ /dev/null @@ -1,10 +0,0 @@ -from multimodal_transformers.model import DistilBertWithTabular -from transformers import AutoTokenizer - - -class URLClassifier: - - def __init__(self): - self.tokenizer = AutoTokenizer.from_pretrained("PDAP/url-classifier") - self.model = DistilBertWithTabular.from_pretrained("PDAP/url-classifier") - self.model.eval() From 1e502a30722477d16bc37eb3c799a57e0b190e24 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 28 Jan 2025 18:28:48 -0500 Subject: [PATCH 018/182] Implement draft of Record Type Classifier: * Refine Task Operators to follow unified logic, including logging * Add tables related to tracking tasks and errors/urls within tasks * Add new tests, update and refine existing tests * Refactor some database client logic --- api/main.py | 18 +- collector_db/AsyncDatabaseClient.py | 278 
+++++++++++++----- collector_db/DTOs/TaskInfo.py | 18 ++ collector_db/DTOs/URLErrorInfos.py | 1 + collector_db/StatementComposer.py | 43 +++ ...5b1c_add_task_tables_and_linking_logic.py} | 42 ++- collector_db/enums.py | 1 - collector_db/models.py | 43 +-- core/AsyncCore.py | 21 ++ core/DTOs/GetTasksResponse.py | 19 ++ core/DTOs/task_data_objects/README.md | 1 + .../URLRecordTypeTDO.py} | 8 +- .../URLRelevanceHuggingfaceTDO.py} | 2 +- .../UrlHtmlTDO.py} | 2 +- core/classes/TaskOperatorBase.py | 64 ++++ core/classes/URLHTMLTaskOperator.py | 100 ++++--- core/classes/URLRecordTypeTaskOperator.py | 77 ++++- .../URLRelevanceHuggingfaceTaskOperator.py | 52 ++-- html_tag_collector/RootURLCache.py | 2 +- hugging_face/HuggingFaceInterface.py | 2 - local_database/DataDumper/docker-compose.yml | 4 +- tests/helpers/DBDataCreator.py | 19 +- tests/helpers/assert_functions.py | 7 + .../lifecycle/test_auto_googler_lifecycle.py | 2 +- .../core/lifecycle/test_ckan_lifecycle.py | 2 +- .../lifecycle/test_muckrock_lifecycles.py | 4 +- .../test_muckrock_collectors.py | 2 +- tests/test_alembic/helpers.py | 22 +- tests/test_alembic/test_revisions.py | 15 +- .../api/helpers/RequestValidator.py | 38 ++- .../integration/api/test_task.py | 41 +++ .../collector_db/test_database_structure.py | 7 - .../collector_db/test_db_client.py | 6 +- .../integration/tasks/__init__.py | 0 .../integration/tasks/test_example_task.py | 87 ++++++ .../integration/tasks/test_url_html_task.py | 105 +++++++ .../tasks/test_url_record_type_task.py | 47 +++ .../test_url_relevancy_huggingface_task.py} | 25 +- 38 files changed, 999 insertions(+), 228 deletions(-) create mode 100644 collector_db/DTOs/TaskInfo.py create mode 100644 collector_db/StatementComposer.py rename collector_db/alembic/versions/{b0e34cec732a_add_task_tables_and_linking_logic.py => 072b32a45b1c_add_task_tables_and_linking_logic.py} (57%) create mode 100644 core/DTOs/GetTasksResponse.py create mode 100644 core/DTOs/task_data_objects/README.md rename core/DTOs/{URLRecordTypeTaskInfo.py => task_data_objects/URLRecordTypeTDO.py} (50%) rename core/DTOs/{URLRelevanceHuggingfaceTaskInfo.py => task_data_objects/URLRelevanceHuggingfaceTDO.py} (78%) rename core/DTOs/{URLHTMLTaskInfo.py => task_data_objects/UrlHtmlTDO.py} (94%) create mode 100644 core/classes/TaskOperatorBase.py create mode 100644 tests/helpers/assert_functions.py create mode 100644 tests/test_automated/integration/api/test_task.py create mode 100644 tests/test_automated/integration/tasks/__init__.py create mode 100644 tests/test_automated/integration/tasks/test_example_task.py create mode 100644 tests/test_automated/integration/tasks/test_url_html_task.py create mode 100644 tests/test_automated/integration/tasks/test_url_record_type_task.py rename tests/test_automated/integration/{cycles/test_url_relevancy_huggingface_cycle.py => tasks/test_url_relevancy_huggingface_task.py} (82%) diff --git a/api/main.py b/api/main.py index 356467af..0a9c0249 100644 --- a/api/main.py +++ b/api/main.py @@ -6,6 +6,7 @@ from api.routes.batch import batch_router from api.routes.collector import collector_router from api.routes.root import root_router +from api.routes.task import task_router from api.routes.url import url_router from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient @@ -71,8 +72,15 @@ async def setup_database(db_client): lifespan=lifespan ) -app.include_router(root_router) -app.include_router(collector_router) -app.include_router(batch_router) 
-app.include_router(annotate_router) -app.include_router(url_router) \ No newline at end of file +routers = [ + root_router, + collector_router, + batch_router, + annotate_router, + url_router, + task_router +] +for router in routers: + app.include_router(router) + + diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index e83d10ca..07f1cc10 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,24 +1,31 @@ from functools import wraps +from typing import Optional -from sqlalchemy import select, exists +from sqlalchemy import select, exists, func from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload from collector_db.ConfigManager import ConfigManager from collector_db.DTOs.MetadataAnnotationInfo import MetadataAnnotationInfo +from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo +from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.DTOs.URLWithHTML import URLWithHTML -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource +from collector_db.StatementComposer import StatementComposer +from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ - RootURL + RootURL, Task, TaskError, LinkTaskURL from collector_manager.enums import URLStatus +from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.enums import BatchStatus + def add_standard_limit_and_offset(statement, page, limit=100): offset = (page - 1) * limit @@ -31,6 +38,14 @@ def __init__(self, db_url: str = get_postgres_connection_string(is_async=True)): echo=ConfigManager.get_sqlalchemy_echo(), ) self.session_maker = async_sessionmaker(bind=self.engine, expire_on_commit=False) + self.statement_composer = StatementComposer() + + @staticmethod + def _add_models(session: AsyncSession, model_class, models): + for model in models: + instance = model_class(**model.model_dump()) + session.add(instance) + @staticmethod def session_manager(method): @@ -66,14 +81,11 @@ async def get_url_metadata_by_status( @session_manager async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo): - url_metadata = URLMetadata(**url_metadata_info.model_dump()) - session.add(url_metadata) + self._add_models(session, URLMetadata, [url_metadata_info]) @session_manager async def add_url_metadatas(self, session: AsyncSession, url_metadata_infos: list[URLMetadataInfo]): - for url_metadata_info in url_metadata_infos: - url_metadata = URLMetadata(**url_metadata_info.model_dump()) - session.add(url_metadata) + self._add_models(session, URLMetadata, url_metadata_infos) @session_manager async def add_url_error_infos(self, session: AsyncSession, url_error_infos: list[URLErrorPydanticInfo]): @@ -88,34 +100,39 @@ async def 
add_url_error_infos(self, session: AsyncSession, url_error_infos: list @session_manager async def get_urls_with_errors(self, session: AsyncSession) -> list[URLErrorPydanticInfo]: - statement = (select(URL, URLErrorInfo.error, URLErrorInfo.updated_at) + statement = (select(URL, URLErrorInfo.error, URLErrorInfo.updated_at, URLErrorInfo.task_id) .join(URLErrorInfo) .where(URL.outcome == URLStatus.ERROR.value) .order_by(URL.id)) scalar_result = await session.execute(statement) results = scalar_result.all() final_results = [] - for url, error, updated_at in results: - final_results.append(URLErrorPydanticInfo(url_id=url.id, error=error, updated_at=updated_at)) + for url, error, updated_at, task_id in results: + final_results.append(URLErrorPydanticInfo( + url_id=url.id, + error=error, + updated_at=updated_at, + task_id=task_id + )) return final_results @session_manager async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]): - for html_content_info in html_content_infos: - # Add HTML Content Info to database - db_html_content_info = URLHTMLContent(**html_content_info.model_dump()) - session.add(db_html_content_info) + self._add_models(session, URLHTMLContent, html_content_infos) + + @session_manager + async def has_pending_urls_without_html_data(self, session: AsyncSession) -> bool: + statement = self.statement_composer.pending_urls_without_html_data() + statement = statement.limit(1) + scalar_result = await session.scalars(statement) + return bool(scalar_result.first()) @session_manager async def get_pending_urls_without_html_data(self, session: AsyncSession): # TODO: Add test that includes some urls WITH html data. Check they're not returned - statement = (select(URL). - outerjoin(URLHTMLContent). - where(URLHTMLContent.id == None). - where(URL.outcome == URLStatus.PENDING.value). - limit(100). - order_by(URL.id)) + statement = self.statement_composer.pending_urls_without_html_data() + statement = statement.limit(100).order_by(URL.id) scalar_result = await session.scalars(statement) return scalar_result.all() @@ -124,26 +141,17 @@ async def get_urls_with_html_data_and_without_metadata_type( self, session: AsyncSession, without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT - ): + ) -> list[URLWithHTML]: - # TODO: Generalize this so that it can exclude based on other attributes # Get URLs with no relevancy metadata statement = (select(URL.id, URL.url, URLHTMLContent). join(URLHTMLContent). - where(URL.outcome == URLStatus.PENDING.value) - # No relevancy metadata - .where( - ~exists( - select(URLMetadata.id). 
- where( - URLMetadata.url_id == URL.id, - URLMetadata.attribute == without_metadata_type.value - ) - ) - ) - .limit(100) - .order_by(URL.id) + where(URL.outcome == URLStatus.PENDING.value)) + statement = self.statement_composer.exclude_urls_with_select_metadata( + statement=statement, + attribute=without_metadata_type ) + statement = statement.limit(100).order_by(URL.id) raw_result = await session.execute(statement) result = raw_result.all() url_ids_to_urls = {url_id: url for url_id, url, _ in result} @@ -167,49 +175,24 @@ async def get_urls_with_html_data_and_without_metadata_type( return final_results @session_manager - async def get_urls_with_html_data_and_no_relevancy_metadata( - self, - session: AsyncSession - ) -> list[URLWithHTML]: + async def has_pending_urls_with_html_data_and_without_metadata_type( + self, + session: AsyncSession, + without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT + ) -> bool: # TODO: Generalize this so that it can exclude based on other attributes # Get URLs with no relevancy metadata statement = (select(URL.id, URL.url, URLHTMLContent). join(URLHTMLContent). - where(URL.outcome == URLStatus.PENDING.value) - # No relevancy metadata - .where( - ~exists( - select(URLMetadata.id). - where( - URLMetadata.url_id == URL.id, - URLMetadata.attribute == URLMetadataAttributeType.RELEVANT.value - ) - ) - ) - .limit(100) - .order_by(URL.id) + where(URL.outcome == URLStatus.PENDING.value)) + statement = self.statement_composer.exclude_urls_with_select_metadata( + statement=statement, + attribute=without_metadata_type ) + statement = statement.limit(1) raw_result = await session.execute(statement) result = raw_result.all() - url_ids_to_urls = {url_id: url for url_id, url, _ in result} - url_ids_to_html_info = {url_id: [] for url_id, _, _ in result} - - for url_id, _, html_info in result: - url_ids_to_html_info[url_id].append( - URLHTMLContentInfo(**html_info.__dict__) - ) - - final_results = [] - for url_id, url in url_ids_to_urls.items(): - url_with_html = URLWithHTML( - url_id=url_id, - url=url, - html_infos=url_ids_to_html_info[url_id] - ) - final_results.append(url_with_html) - - - return final_results + return len(result) > 0 @session_manager async def get_urls_with_metadata( @@ -425,5 +408,156 @@ async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetU count=len(final_results) ) + @session_manager + async def initiate_task( + self, + session: AsyncSession, + task_type: TaskType + ) -> int: + # Create Task + task = Task( + task_type=task_type, + task_status=BatchStatus.IN_PROCESS.value + ) + session.add(task) + # Return Task ID + await session.flush() + await session.refresh(task) + return task.id + @session_manager + async def update_task_status(self, session: AsyncSession, task_id: int, status: BatchStatus): + task = await session.get(Task, task_id) + task.task_status = status.value + await session.commit() + + @session_manager + async def add_task_error(self, session: AsyncSession, task_id: int, error: str): + task_error = TaskError( + task_id=task_id, + error=error + ) + session.add(task_error) + await session.commit() + + @session_manager + async def get_task_info(self, session: AsyncSession, task_id: int) -> TaskInfo: + # Get Task + result = await session.execute( + select(Task) + .where(Task.id == task_id) + .options( + selectinload(Task.urls), + selectinload(Task.error), + selectinload(Task.errored_urls) + ) + ) + task = result.scalars().first() + error = task.error[0].error if len(task.error) > 0 else 
None
+        # Get error info if any
+        # Get URLs
+        urls = task.urls
+        url_infos = []
+        for url in urls:
+            url_info = URLInfo(
+                id=url.id,
+                batch_id=url.batch_id,
+                url=url.url,
+                collector_metadata=url.collector_metadata,
+                outcome=URLStatus(url.outcome),
+                updated_at=url.updated_at
+            )
+            url_infos.append(url_info)
+
+        errored_urls = []
+        for url in task.errored_urls:
+            url_error_info = URLErrorPydanticInfo(
+                task_id=url.task_id,
+                url_id=url.url_id,
+                error=url.error,
+                updated_at=url.updated_at
+            )
+            errored_urls.append(url_error_info)
+        return TaskInfo(
+            task_type=TaskType(task.task_type),
+            task_status=BatchStatus(task.task_status),
+            error_info=error,
+            updated_at=task.updated_at,
+            urls=url_infos,
+            url_errors=errored_urls
+        )
+
+    @session_manager
+    async def get_html_content_info(self, session: AsyncSession, url_id: int) -> list[URLHTMLContentInfo]:
+        session_result = await session.execute(
+            select(URLHTMLContent)
+            .where(URLHTMLContent.url_id == url_id)
+        )
+        results = session_result.scalars().all()
+        return [URLHTMLContentInfo(**result.__dict__) for result in results]
+
+
+
+    @session_manager
+    async def link_urls_to_task(self, session: AsyncSession, task_id: int, url_ids: list[int]):
+        for url_id in url_ids:
+            link = LinkTaskURL(
+                url_id=url_id,
+                task_id=task_id
+            )
+            session.add(link)
+
+    @session_manager
+    async def get_tasks(
+        self,
+        session: AsyncSession,
+        task_type: Optional[TaskType] = None,
+        task_status: Optional[BatchStatus] = None,
+        page: int = 1
+    ) -> GetTasksResponse:
+        url_count_subquery = self.statement_composer.simple_count_subquery(
+            LinkTaskURL,
+            'task_id',
+            'url_count'
+        )
+
+        url_error_count_subquery = self.statement_composer.simple_count_subquery(
+            URLErrorInfo,
+            'task_id',
+            'url_error_count'
+        )
+        statement = select(
+            Task,
+            url_count_subquery.c.url_count,
+            url_error_count_subquery.c.url_error_count
+        ).outerjoin(
+            url_count_subquery,
+            Task.id == url_count_subquery.c.task_id
+        ).outerjoin(
+            url_error_count_subquery,
+            Task.id == url_error_count_subquery.c.task_id
+        )
+        if task_type is not None:
+            statement = statement.where(Task.task_type == task_type.value)
+        if task_status is not None:
+            statement = statement.where(Task.task_status == task_status.value)
+        statement = add_standard_limit_and_offset(statement, page)
+
+        execute_result = await session.execute(statement)
+        all_results = execute_result.all()
+        final_results = []
+        for task, url_count, url_error_count in all_results:
+            final_results.append(
+                GetTasksResponseTaskInfo(
+                    task_id=task.id,
+                    type=TaskType(task.task_type),
+                    status=BatchStatus(task.task_status),
+                    url_count=url_count if url_count is not None else 0,
+                    url_error_count=url_error_count if url_error_count is not None else 0,
+                    updated_at=task.updated_at
+                )
+            )
+        return GetTasksResponse(
+            tasks=final_results
+        )
diff --git a/collector_db/DTOs/TaskInfo.py b/collector_db/DTOs/TaskInfo.py
new file mode 100644
index 00000000..e8d8090d
--- /dev/null
+++ b/collector_db/DTOs/TaskInfo.py
@@ -0,0 +1,18 @@
+import datetime
+from typing import Optional
+
+from pydantic import BaseModel
+
+from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
+from collector_db.DTOs.URLInfo import URLInfo
+from collector_db.enums import TaskType
+from core.enums import BatchStatus
+
+
+class TaskInfo(BaseModel):
+    task_type: TaskType
+    task_status: BatchStatus
+    updated_at: datetime.datetime
+    error_info: Optional[str] = None
+    urls: list[URLInfo]
+    url_errors: list[URLErrorPydanticInfo]
\ No newline at end of file
diff --git
a/collector_db/DTOs/URLErrorInfos.py b/collector_db/DTOs/URLErrorInfos.py index cf73a6dc..46f5b9fa 100644 --- a/collector_db/DTOs/URLErrorInfos.py +++ b/collector_db/DTOs/URLErrorInfos.py @@ -5,6 +5,7 @@ class URLErrorPydanticInfo(BaseModel): + task_id: int url_id: int error: str updated_at: Optional[datetime.datetime] = None \ No newline at end of file diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py new file mode 100644 index 00000000..dc756fb3 --- /dev/null +++ b/collector_db/StatementComposer.py @@ -0,0 +1,43 @@ + +from sqlalchemy import Select, select, exists, Table, func, Subquery + +from collector_db.enums import URLMetadataAttributeType +from collector_db.models import URL, URLHTMLContent, URLMetadata +from collector_manager.enums import URLStatus + + +class StatementComposer: + """ + Assists in the composition of SQLAlchemy statements + """ + + @staticmethod + def pending_urls_without_html_data() -> Select: + return (select(URL). + outerjoin(URLHTMLContent). + where(URLHTMLContent.id == None). + where(URL.outcome == URLStatus.PENDING.value)) + + @staticmethod + def exclude_urls_with_select_metadata( + statement: Select, + attribute: URLMetadataAttributeType + ) -> Select: + return (statement.where( + ~exists( + select(URLMetadata.id). + where( + URLMetadata.url_id == URL.id, + URLMetadata.attribute == attribute.value + ) + ) + )) + + @staticmethod + def simple_count_subquery(model, attribute: str, label: str) -> Subquery: + attr_value = getattr(model, attribute) + return select( + attr_value, + func.count(attr_value).label(label) + ).group_by(attr_value).subquery() + diff --git a/collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py b/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py similarity index 57% rename from collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py rename to collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py index 2061ef45..dcae164b 100644 --- a/collector_db/alembic/versions/b0e34cec732a_add_task_tables_and_linking_logic.py +++ b/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py @@ -1,32 +1,46 @@ """Add Task Tables and linking logic -Revision ID: b0e34cec732a +Revision ID: 072b32a45b1c Revises: dae00e5aa8dd -Create Date: 2025-01-27 13:22:49.620212 +Create Date: 2025-01-27 15:48:02.713484 """ from typing import Sequence, Union from alembic import op import sqlalchemy as sa -from sqlalchemy.dialects import postgresql + +from collector_db.enums import PGEnum # revision identifiers, used by Alembic. 
-revision: str = 'b0e34cec732a' +revision: str = '072b32a45b1c' down_revision: Union[str, None] = 'dae00e5aa8dd' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +task_type = PGEnum( + 'HTML', + 'Relevancy', + 'Record Type', + name='task_type', +) + def upgrade() -> None: op.create_table('tasks', sa.Column('id', sa.Integer(), nullable=False), - sa.Column('task_type', sa.String(), nullable=False), - sa.Column('task_id', sa.String(), nullable=False), - sa.Column('task_status', sa.String(), nullable=False), + sa.Column('task_type', task_type, nullable=False), + sa.Column( + 'task_status', + PGEnum( + 'complete', 'error', 'in-process', 'aborted', + name='batch_status', + create_type=False + ), + nullable=False + ), sa.Column('updated_at', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=False), sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('task_type', 'task_id', name='uq_task_type_task_id') ) op.create_table('task_errors', sa.Column('id', sa.Integer(), nullable=False), @@ -44,9 +58,21 @@ def upgrade() -> None: sa.PrimaryKeyConstraint('task_id', 'url_id'), sa.UniqueConstraint('task_id', 'url_id', name='uq_task_id_url_id') ) + # Change to URL Error Info requires deleting prior data + op.execute("DELETE FROM url_error_info;") + + op.add_column('url_error_info', sa.Column('task_id', sa.Integer(), nullable=False)) + op.create_unique_constraint('uq_url_id_error', 'url_error_info', ['url_id', 'task_id']) + op.create_foreign_key("fk_url_error_info_task", 'url_error_info', 'tasks', ['task_id'], ['id']) def downgrade() -> None: + + op.drop_constraint("fk_url_error_info_task", 'url_error_info', type_='foreignkey') + op.drop_constraint('uq_url_id_error', 'url_error_info', type_='unique') + op.drop_column('url_error_info', 'task_id') op.drop_table('link_task_urls') op.drop_table('task_errors') op.drop_table('tasks') + + task_type.drop(op.get_bind(), checkfirst=True) diff --git a/collector_db/enums.py b/collector_db/enums.py index 66734a9c..a6f3c95e 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -37,7 +37,6 @@ class TaskType(PyEnum): RELEVANCY = "Relevancy" RECORD_TYPE = "Record Type" - class PGEnum(TypeDecorator): impl = postgresql.ENUM diff --git a/collector_db/models.py b/collector_db/models.py index 05578dd2..f1eac526 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -16,6 +16,7 @@ CURRENT_TIME_SERVER_DEFAULT = func.now() +batch_status_enum = PGEnum('complete', 'error', 'in-process', 'aborted', name='batch_status') class Batch(Base): __tablename__ = 'batches' @@ -29,9 +30,7 @@ class Batch(Base): user_id = Column(Integer, nullable=False) # Gives the status of the batch status = Column( - postgresql.ENUM( - 'complete', 'error', 'in-process', 'aborted', - name='batch_status'), + batch_status_enum, nullable=False ) # The number of URLs in the batch @@ -89,7 +88,7 @@ class URL(Base): tasks = relationship( "Task", secondary="link_task_urls", - back_populates="url", + back_populates="urls", ) @@ -158,12 +157,17 @@ class RootURL(Base): class URLErrorInfo(Base): __tablename__ = 'url_error_info' + __table_args__ = (UniqueConstraint( + "url_id", + "task_id", + name="uq_url_id_error"), + ) id = Column(Integer, primary_key=True) url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) error = Column(Text, nullable=False) updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) - task_id = Column(Integer, ForeignKey('tasks.id'), nullable=True) + task_id = 
Column(Integer, ForeignKey('tasks.id'), nullable=False) # Relationships url = relationship("URL", back_populates="error_info") @@ -241,25 +245,22 @@ class Missing(Base): class Task(Base): __tablename__ = 'tasks' - __table_args__ = (UniqueConstraint( - "task_type", - "task_id", - name="uq_task_type_task_id"), - ) id = Column(Integer, primary_key=True) - task_type = Column(String, nullable=False) - task_id = Column(String, nullable=False) - task_status = Column(String, nullable=False) + task_type = Column( + PGEnum( + 'HTML', 'Relevancy', 'Record Type', name='task_type' + ), nullable=False) + task_status = Column(batch_status_enum, nullable=False) updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) # Relationships urls = relationship( "URL", secondary="link_task_urls", - back_populates="task" + back_populates="tasks" ) - task_errors = relationship("TaskError", back_populates="task") + error = relationship("TaskError", back_populates="task") errored_urls = relationship("URLErrorInfo", back_populates="task") class LinkTaskURL(Base): @@ -273,10 +274,6 @@ class LinkTaskURL(Base): task_id = Column(Integer, ForeignKey('tasks.id', ondelete="CASCADE"), primary_key=True) url_id = Column(Integer, ForeignKey('urls.id', ondelete="CASCADE"), primary_key=True) - # Relationships - task = relationship("Task", back_populates="link_task_urls") - url = relationship("URL", back_populates="link_task_urls") - class TaskError(Base): __tablename__ = 'task_errors' @@ -286,4 +283,10 @@ class TaskError(Base): updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) # Relationships - task = relationship("Task", back_populates="task_errors") \ No newline at end of file + task = relationship("Task", back_populates="error") + + __table_args__ = (UniqueConstraint( + "task_id", + "error", + name="uq_task_id_error"), + ) \ No newline at end of file diff --git a/core/AsyncCore.py b/core/AsyncCore.py index de70abd2..5318a044 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,17 +1,23 @@ import logging from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo +from collector_db.enums import TaskType from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse +from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo from core.DTOs.RelevanceAnnotationRequestInfo import RelevanceAnnotationRequestInfo from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator +from core.enums import BatchStatus from html_tag_collector.DataClassTags import convert_to_response_html_info from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface +from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier class AsyncCore: @@ -47,9 +53,18 @@ async def run_url_relevance_huggingface_task(self): ) await operator.run_task() + async def run_url_record_type_task(self): + self.logger.info("Running URL Record Type Task") + operator = 
URLRecordTypeTaskOperator( + adb_client=self.adb_client, + classifier=DeepSeekRecordClassifier() + ) + await operator.run_task() + async def run_tasks(self): await self.run_url_html_task() await self.run_url_relevance_huggingface_task() + await self.run_url_record_type_task() async def convert_to_relevance_annotation_request_info(self, url_info: URLAnnotationInfo) -> RelevanceAnnotationRequestInfo: response_html_info = convert_to_response_html_info( @@ -87,3 +102,9 @@ async def submit_url_relevance_annotation( async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: return await self.adb_client.get_urls(page=page, errors=errors) + + async def get_task_info(self, task_id: int) -> TaskInfo: + return await self.adb_client.get_task_info(task_id=task_id) + + async def get_tasks(self, page: int, task_type: TaskType, task_status: BatchStatus) -> GetTasksResponse: + return await self.adb_client.get_tasks(page=page, task_type=task_type, task_status=task_status) diff --git a/core/DTOs/GetTasksResponse.py b/core/DTOs/GetTasksResponse.py new file mode 100644 index 00000000..42b3d954 --- /dev/null +++ b/core/DTOs/GetTasksResponse.py @@ -0,0 +1,19 @@ +import datetime + +from pydantic import BaseModel + +from collector_db.enums import TaskType +from core.enums import BatchStatus + + +class GetTasksResponseTaskInfo(BaseModel): + task_id: int + type: TaskType + status: BatchStatus + url_count: int + url_error_count: int + updated_at: datetime.datetime + + +class GetTasksResponse(BaseModel): + tasks: list[GetTasksResponseTaskInfo] diff --git a/core/DTOs/task_data_objects/README.md b/core/DTOs/task_data_objects/README.md new file mode 100644 index 00000000..3d2fc5ae --- /dev/null +++ b/core/DTOs/task_data_objects/README.md @@ -0,0 +1 @@ +Task Data Objects (or TDOs) are data transfer objects (DTOs) used within a given task operation. Each Task type has one type of TDO. 
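For illustration, a minimal sketch of the pattern, assuming a hypothetical `ExampleTDO` (not a class in this repository): a TDO starts with only its input populated, and later task steps fill in its result or error fields.

    from typing import Optional

    from pydantic import BaseModel


    class ExampleTDO(BaseModel):
        # Input: populated when the task set is assembled.
        url: str
        # Output: filled in by a later step of the task.
        result: Optional[str] = None
        # Error: set if any step fails for this URL.
        error: Optional[str] = None

        def is_errored(self) -> bool:
            return self.error is not None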
\ No newline at end of file diff --git a/core/DTOs/URLRecordTypeTaskInfo.py b/core/DTOs/task_data_objects/URLRecordTypeTDO.py similarity index 50% rename from core/DTOs/URLRecordTypeTaskInfo.py rename to core/DTOs/task_data_objects/URLRecordTypeTDO.py index 6c5d8ea7..34bbc233 100644 --- a/core/DTOs/URLRecordTypeTaskInfo.py +++ b/core/DTOs/task_data_objects/URLRecordTypeTDO.py @@ -6,6 +6,10 @@ from core.enums import RecordType -class URLRecordTypeTaskInfo(BaseModel): +class URLRecordTypeTDO(BaseModel): url_with_html: URLWithHTML - record_type: Optional[RecordType] = None \ No newline at end of file + record_type: Optional[RecordType] = None + error: Optional[str] = None + + def is_errored(self): + return self.error is not None \ No newline at end of file diff --git a/core/DTOs/URLRelevanceHuggingfaceTaskInfo.py b/core/DTOs/task_data_objects/URLRelevanceHuggingfaceTDO.py similarity index 78% rename from core/DTOs/URLRelevanceHuggingfaceTaskInfo.py rename to core/DTOs/task_data_objects/URLRelevanceHuggingfaceTDO.py index bb4553d1..33311a9b 100644 --- a/core/DTOs/URLRelevanceHuggingfaceTaskInfo.py +++ b/core/DTOs/task_data_objects/URLRelevanceHuggingfaceTDO.py @@ -5,6 +5,6 @@ from collector_db.DTOs.URLWithHTML import URLWithHTML -class URLRelevanceHuggingfaceTaskInfo(BaseModel): +class URLRelevanceHuggingfaceTDO(BaseModel): url_with_html: URLWithHTML relevant: Optional[bool] = None diff --git a/core/DTOs/URLHTMLTaskInfo.py b/core/DTOs/task_data_objects/UrlHtmlTDO.py similarity index 94% rename from core/DTOs/URLHTMLTaskInfo.py rename to core/DTOs/task_data_objects/UrlHtmlTDO.py index cff69e4f..05e9caf2 100644 --- a/core/DTOs/URLHTMLTaskInfo.py +++ b/core/DTOs/task_data_objects/UrlHtmlTDO.py @@ -7,7 +7,7 @@ @dataclass -class URLHTMLTaskInfo: +class UrlHtmlTDO: url_info: URLInfo url_response_info: Optional[URLResponseInfo] = None html_tag_info: Optional[ResponseHTMLInfo] = None diff --git a/core/classes/TaskOperatorBase.py b/core/classes/TaskOperatorBase.py new file mode 100644 index 00000000..6fd86c97 --- /dev/null +++ b/core/classes/TaskOperatorBase.py @@ -0,0 +1,64 @@ +from abc import ABC, abstractmethod + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.enums import TaskType +from core.enums import BatchStatus + + +class TaskOperatorBase(ABC): + + def __init__(self, adb_client: AsyncDatabaseClient): + self.adb_client = adb_client + self.task_id = None + self.tasks_linked = False + + @property + @abstractmethod + def task_type(self) -> TaskType: + raise NotImplementedError + + @abstractmethod + async def meets_task_prerequisites(self): + """ + A task should not be initiated unless certain + conditions are met + """ + raise NotImplementedError + + async def link_urls_to_task(self, url_ids: list[int]): + await self.adb_client.link_urls_to_task(task_id=self.task_id, url_ids=url_ids) + self.tasks_linked = True + + async def initiate_task_in_db(self) -> int: + task_id = await self.adb_client.initiate_task( + task_type=self.task_type + ) + return task_id + + async def conclude_task_in_db(self): + if not self.tasks_linked: + raise Exception("Task has not been linked to any URLs") + await self.adb_client.update_task_status(task_id=self.task_id, status=BatchStatus.COMPLETE) + + async def run_task(self): + if not await self.meets_task_prerequisites(): + print(f"Task {self.task_type.value} does not meet prerequisites. 
Skipping...") + return + self.task_id = await self.initiate_task_in_db() + + try: + await self.inner_task_logic() + await self.conclude_task_in_db() + except Exception as e: + await self.handle_task_error(e) + + @abstractmethod + async def inner_task_logic(self): + raise NotImplementedError + + async def handle_task_error(self, e): + await self.adb_client.update_task_status(task_id=self.task_id, status=BatchStatus.ERROR) + await self.adb_client.add_task_error( + task_id=self.task_id, + error=str(e) + ) diff --git a/core/classes/URLHTMLTaskOperator.py b/core/classes/URLHTMLTaskOperator.py index 42c3e21a..63321635 100644 --- a/core/classes/URLHTMLTaskOperator.py +++ b/core/classes/URLHTMLTaskOperator.py @@ -1,93 +1,105 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLInfo import URLInfo -from core.DTOs.URLHTMLTaskInfo import URLHTMLTaskInfo +from collector_db.enums import TaskType +from core.DTOs.task_data_objects.UrlHtmlTDO import UrlHtmlTDO from core.classes.HTMLContentInfoGetter import HTMLContentInfoGetter +from core.classes.TaskOperatorBase import TaskOperatorBase from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface -class URLHTMLTaskOperator: +class URLHTMLTaskOperator(TaskOperatorBase): def __init__( - self, - url_request_interface: URLRequestInterface, - adb_client: AsyncDatabaseClient, - html_parser: HTMLResponseParser + self, + url_request_interface: URLRequestInterface, + adb_client: AsyncDatabaseClient, + html_parser: HTMLResponseParser ): + super().__init__(adb_client) self.url_request_interface = url_request_interface - self.adb_client = adb_client self.html_parser = html_parser - async def run_task(self): + @property + def task_type(self): + return TaskType.HTML + + async def meets_task_prerequisites(self): + return await self.adb_client.has_pending_urls_without_html_data() + + async def inner_task_logic(self): print("Running URL HTML Task...") - task_infos = await self.get_pending_urls_without_html_data() - await self.get_raw_html_data_for_urls(task_infos) - success_subset, error_subset = await self.separate_success_and_error_subsets(task_infos) + tdos = await self.get_pending_urls_without_html_data() + url_ids = [task_info.url_info.id for task_info in tdos] + await self.link_urls_to_task(url_ids=url_ids) + await self.get_raw_html_data_for_urls(tdos) + success_subset, error_subset = await self.separate_success_and_error_subsets(tdos) await self.update_errors_in_database(error_subset) await self.process_html_data(success_subset) await self.update_html_data_in_database(success_subset) - async def get_just_urls(self, task_infos: list[URLHTMLTaskInfo]): - return [task_info.url_info.url for task_info in task_infos] + async def get_just_urls(self, tdos: list[UrlHtmlTDO]): + return [task_info.url_info.url for task_info in tdos] async def get_pending_urls_without_html_data(self): pending_urls: list[URLInfo] = await self.adb_client.get_pending_urls_without_html_data() - task_infos = [ - URLHTMLTaskInfo( + tdos = [ + UrlHtmlTDO( url_info=url_info, ) for url_info in pending_urls ] - return task_infos + return tdos - async def get_raw_html_data_for_urls(self, task_infos: list[URLHTMLTaskInfo]): - just_urls = await self.get_just_urls(task_infos) + async def get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]): + just_urls = await self.get_just_urls(tdos) url_response_infos = await 
self.url_request_interface.make_requests(just_urls) - for task_info, url_response_info in zip(task_infos, url_response_infos): - task_info.url_response_info = url_response_info + for tdo, url_response_info in zip(tdos, url_response_infos): + tdo.url_response_info = url_response_info async def separate_success_and_error_subsets( self, - task_infos: list[URLHTMLTaskInfo] + tdos: list[UrlHtmlTDO] ) -> tuple[ - list[URLHTMLTaskInfo], # Successful - list[URLHTMLTaskInfo] # Error + list[UrlHtmlTDO], # Successful + list[UrlHtmlTDO] # Error ]: - errored_task_infos = [] - successful_task_infos = [] - for task_info in task_infos: - if not task_info.url_response_info.success: - errored_task_infos.append(task_info) + errored_tdos = [] + successful_tdos = [] + for tdo in tdos: + if not tdo.url_response_info.success: + errored_tdos.append(tdo) else: - successful_task_infos.append(task_info) - return successful_task_infos, errored_task_infos + successful_tdos.append(tdo) + return successful_tdos, errored_tdos - async def update_errors_in_database(self, errored_task_infos: list[URLHTMLTaskInfo]): + async def update_errors_in_database(self, error_tdos: list[UrlHtmlTDO]): error_infos = [] - for error_task_info in errored_task_infos: + for error_tdo in error_tdos: error_info = URLErrorPydanticInfo( - url_id=error_task_info.url_info.id, - error=str(error_task_info.url_response_info.exception), + task_id=self.task_id, + url_id=error_tdo.url_info.id, + error=str(error_tdo.url_response_info.exception), ) error_infos.append(error_info) await self.adb_client.add_url_error_infos(error_infos) - async def process_html_data(self, task_infos: list[URLHTMLTaskInfo]): - for task_info in task_infos: + async def process_html_data(self, tdos: list[UrlHtmlTDO]): + for tdo in tdos: html_tag_info = await self.html_parser.parse( - url=task_info.url_info.url, - html_content=task_info.url_response_info.html, - content_type=task_info.url_response_info.content_type + url=tdo.url_info.url, + html_content=tdo.url_response_info.html, + content_type=tdo.url_response_info.content_type ) - task_info.html_tag_info = html_tag_info + tdo.html_tag_info = html_tag_info - async def update_html_data_in_database(self, task_infos: list[URLHTMLTaskInfo]): + async def update_html_data_in_database(self, tdos: list[UrlHtmlTDO]): html_content_infos = [] - for task_info in task_infos: + for tdo in tdos: hcig = HTMLContentInfoGetter( - response_html_info=task_info.html_tag_info, - url_id=task_info.url_info.id + response_html_info=tdo.html_tag_info, + url_id=tdo.url_info.id ) results = hcig.get_all_html_content() html_content_infos.extend(results) diff --git a/core/classes/URLRecordTypeTaskOperator.py b/core/classes/URLRecordTypeTaskOperator.py index 3b1b27b2..18ac8ef4 100644 --- a/core/classes/URLRecordTypeTaskOperator.py +++ b/core/classes/URLRecordTypeTaskOperator.py @@ -1,24 +1,85 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.enums import URLMetadataAttributeType +from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo +from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo +from collector_db.enums import URLMetadataAttributeType, TaskType, ValidationStatus, ValidationSource +from core.DTOs.task_data_objects.URLRecordTypeTDO import URLRecordTypeTDO +from core.classes.TaskOperatorBase import TaskOperatorBase +from core.enums import RecordType from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier -class URLRecordTypeTaskOperator: +class 
URLRecordTypeTaskOperator(TaskOperatorBase): def __init__( self, adb_client: AsyncDatabaseClient, classifier: DeepSeekRecordClassifier ): - self.adb_client = adb_client + super().__init__(adb_client) self.classifier = classifier - async def run_task(self): + @property + def task_type(self): + return TaskType.RECORD_TYPE + + async def meets_task_prerequisites(self): + return await self.adb_client.has_pending_urls_with_html_data_and_without_metadata_type( + without_metadata_type=URLMetadataAttributeType.RECORD_TYPE + ) + + async def get_tdos(self) -> list[URLRecordTypeTDO]: + urls_with_html = await self.adb_client.get_urls_with_html_data_and_without_metadata_type( + without_metadata_type=URLMetadataAttributeType.RECORD_TYPE + ) + tdos = [URLRecordTypeTDO(url_with_html=url_with_html) for url_with_html in urls_with_html] + return tdos + + async def inner_task_logic(self): # Get pending urls from Source Collector # with HTML data and without Record Type Metadata - task_infos = await self.adb_client.get_pending_urls_without_html_data( - without_metadata_attribute=URLMetadataAttributeType.RECORD_TYPE - ) + tdos = await self.get_tdos() + url_ids = [tdo.url_with_html.url_id for tdo in tdos] + await self.link_urls_to_task(url_ids=url_ids) + + await self.get_ml_classifications(tdos) + success_subset, error_subset = await self.separate_success_and_error_subsets(tdos) + await self.put_results_into_database(success_subset) + await self.update_errors_in_database(error_subset) + + async def update_errors_in_database(self, tdos: list[URLRecordTypeTDO]): + error_infos = [] + for tdo in tdos: + error_info = URLErrorPydanticInfo( + task_id=self.task_id, + url_id=tdo.url_with_html.url_id, + error=tdo.error + ) + error_infos.append(error_info) + await self.adb_client.add_url_error_infos(error_infos) + + async def put_results_into_database(self, tdos: list[URLRecordTypeTDO]): + url_metadatas = [] + for tdo in tdos: + url_metadata = URLMetadataInfo( + url_id=tdo.url_with_html.url_id, + attribute=URLMetadataAttributeType.RECORD_TYPE, + value=str(tdo.record_type), + validation_status=ValidationStatus.PENDING_VALIDATION, + validation_source=ValidationSource.MACHINE_LEARNING + ) + url_metadatas.append(url_metadata) + await self.adb_client.add_url_metadatas(url_metadatas) + + async def separate_success_and_error_subsets(self, tdos: list[URLRecordTypeTDO]): + success_subset = [tdo for tdo in tdos if not tdo.is_errored()] + error_subset = [tdo for tdo in tdos if tdo.is_errored()] + return success_subset, error_subset - async def get_ml_classifications(self, task_infos: list[URLRecordTypeTaskInfo]): \ No newline at end of file + async def get_ml_classifications(self, tdos: list[URLRecordTypeTDO]): + for tdo in tdos: + try: + record_type_str = await self.classifier.classify_url(tdo.url_with_html.html_infos) + tdo.record_type = RecordType(record_type_str) + except Exception as e: + tdo.error = str(e) \ No newline at end of file diff --git a/core/classes/URLRelevanceHuggingfaceTaskOperator.py b/core/classes/URLRelevanceHuggingfaceTaskOperator.py index 904adbe1..2d54a856 100644 --- a/core/classes/URLRelevanceHuggingfaceTaskOperator.py +++ b/core/classes/URLRelevanceHuggingfaceTaskOperator.py @@ -1,63 +1,73 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.DTOs.URLWithHTML import URLWithHTML -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from core.DTOs.URLRelevanceHuggingfaceTaskInfo 
import URLRelevanceHuggingfaceTaskInfo +from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType +from core.DTOs.task_data_objects.URLRelevanceHuggingfaceTDO import URLRelevanceHuggingfaceTDO +from core.classes.TaskOperatorBase import TaskOperatorBase from hugging_face.HuggingFaceInterface import HuggingFaceInterface -class URLRelevanceHuggingfaceTaskOperator: +class URLRelevanceHuggingfaceTaskOperator(TaskOperatorBase): def __init__( self, adb_client: AsyncDatabaseClient, huggingface_interface: HuggingFaceInterface ): - self.adb_client = adb_client + super().__init__(adb_client) self.huggingface_interface = huggingface_interface - async def run_task(self): + @property + def task_type(self): + return TaskType.RELEVANCY + + async def meets_task_prerequisites(self): + return await self.adb_client.has_pending_urls_with_html_data_and_without_metadata_type(without_metadata_type=URLMetadataAttributeType.RELEVANT) + + async def inner_task_logic(self): # Get pending urls from Source Collector # with HTML data and without Relevancy Metadata - task_infos = await self.get_pending_url_info( + tdos = await self.get_pending_url_info( without_metadata_attribute=URLMetadataAttributeType.RELEVANT ) + url_ids = [tdo.url_with_html.url_id for tdo in tdos] + await self.link_urls_to_task(url_ids=url_ids) # Pipe into Huggingface - await self.add_huggingface_relevancy(task_infos) + await self.add_huggingface_relevancy(tdos) # Put results into Database - await self.put_results_into_database(task_infos) + await self.put_results_into_database(tdos) - async def put_results_into_database(self, task_infos): + async def put_results_into_database(self, tdos): url_metadatas = [] - for task_info in task_infos: + for tdo in tdos: url_metadata = URLMetadataInfo( - url_id=task_info.url_with_html.url_id, + url_id=tdo.url_with_html.url_id, attribute=URLMetadataAttributeType.RELEVANT, - value=str(task_info.relevant), + value=str(tdo.relevant), validation_status=ValidationStatus.PENDING_VALIDATION, validation_source=ValidationSource.MACHINE_LEARNING ) url_metadatas.append(url_metadata) await self.adb_client.add_url_metadatas(url_metadatas) - async def add_huggingface_relevancy(self, task_infos: list[URLRelevanceHuggingfaceTaskInfo]): - urls_with_html = [task_info.url_with_html for task_info in task_infos] + async def add_huggingface_relevancy(self, tdos: list[URLRelevanceHuggingfaceTDO]): + urls_with_html = [tdo.url_with_html for tdo in tdos] results = self.huggingface_interface.get_url_relevancy(urls_with_html) - for task_info, result in zip(task_infos, results): - task_info.relevant = result + for tdo, result in zip(tdos, results): + tdo.relevant = result async def get_pending_url_info( self, without_metadata_attribute: URLMetadataAttributeType - ) -> list[URLRelevanceHuggingfaceTaskInfo]: - task_infos = [] + ) -> list[URLRelevanceHuggingfaceTDO]: + tdos = [] pending_urls: list[URLWithHTML] = await self.adb_client.get_urls_with_html_data_and_without_metadata_type( without_metadata_type=without_metadata_attribute ) for url_with_html in pending_urls: - task_info = URLRelevanceHuggingfaceTaskInfo( + tdo = URLRelevanceHuggingfaceTDO( url_with_html=url_with_html ) - task_infos.append(task_info) - return task_infos + tdos.append(tdo) + return tdos diff --git a/html_tag_collector/RootURLCache.py b/html_tag_collector/RootURLCache.py index be670475..e306b6e1 100644 --- a/html_tag_collector/RootURLCache.py +++ b/html_tag_collector/RootURLCache.py @@ -26,7 +26,7 @@ async def save_to_cache(self, url: str, title: str): self.cache[url] = title await 
self.adb_client.add_to_root_url_cache(url=url, page_title=title) - async def get_from_cache(self, url: str): + async def get_from_cache(self, url: str) -> Optional[str]: if self.cache is None: self.cache = await self.adb_client.load_root_url_cache() diff --git a/hugging_face/HuggingFaceInterface.py b/hugging_face/HuggingFaceInterface.py index 2ea635d5..4e37e9c4 100644 --- a/hugging_face/HuggingFaceInterface.py +++ b/hugging_face/HuggingFaceInterface.py @@ -1,14 +1,12 @@ from transformers import pipeline from collector_db.DTOs.URLWithHTML import URLWithHTML -from hugging_face.URLClassifier import URLClassifier class HuggingFaceInterface: def __init__(self): self.relevance_pipe = pipeline("text-classification", model="PDAP/url-relevance") - self.url_classifier = URLClassifier() def get_url_relevancy( self, diff --git a/local_database/DataDumper/docker-compose.yml b/local_database/DataDumper/docker-compose.yml index f24c78b5..4a28c5e8 100644 --- a/local_database/DataDumper/docker-compose.yml +++ b/local_database/DataDumper/docker-compose.yml @@ -22,6 +22,6 @@ services: entrypoint: [ "bash", # Comment out one of the following lines depending on your needs -# "/usr/local/bin/dump.sh" - "/usr/local/bin/restore.sh" + "/usr/local/bin/dump.sh" +# "/usr/local/bin/restore.sh" ] \ No newline at end of file diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index c7fce247..c9c26846 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo @@ -9,7 +9,7 @@ from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.DatabaseClient import DatabaseClient -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource +from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_manager.enums import CollectorType from core.enums import BatchStatus from tests.helpers.simple_test_data_functions import generate_test_urls @@ -34,6 +34,12 @@ def batch(self): ) ) + async def task(self, url_ids: Optional[list[int]] = None) -> int: + task_id = await self.adb_client.initiate_task(task_type=TaskType.HTML) + if url_ids is not None: + await self.adb_client.link_urls_to_task(task_id=task_id, url_ids=url_ids) + return task_id + def urls(self, batch_id: int, url_count: int) -> InsertURLsInfo: raw_urls = generate_test_urls(url_count) url_infos: List[URLInfo] = [] @@ -99,12 +105,19 @@ async def metadata( ) ) - async def error_info(self, url_ids: list[int]): + async def error_info( + self, + url_ids: list[int], + task_id: Optional[int] = None + ): + if task_id is None: + task_id = await self.task() error_infos = [] for url_id in url_ids: url_error_info = URLErrorPydanticInfo( url_id=url_id, error="test error", + task_id=task_id ) error_infos.append(url_error_info) await self.adb_client.add_url_error_infos(error_infos) diff --git a/tests/helpers/assert_functions.py b/tests/helpers/assert_functions.py new file mode 100644 index 00000000..ef379d3e --- /dev/null +++ b/tests/helpers/assert_functions.py @@ -0,0 +1,7 @@ +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.models import Task + + +async def assert_database_has_no_tasks(adb_client: AsyncDatabaseClient): + tasks = await adb_client.get_all(Task) + assert 
len(tasks) == 0 \ No newline at end of file diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index 2489d17f..c962e1e7 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -1,12 +1,12 @@ import os import dotenv -from tests.automated.core.helpers.common_test_procedures import run_collector_and_wait_for_completion import api.dependencies from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.enums import CollectorType from core.enums import BatchStatus +from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion def test_auto_googler_collector_lifecycle(test_core): diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 10802c77..575dedfa 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,10 +1,10 @@ -from tests.automated.core.helpers.common_test_procedures import run_collector_and_wait_for_completion import api.dependencies from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.enums import CollectorType from core.enums import BatchStatus from source_collectors.ckan.search_terms import group_search, package_search, organization_search +from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion def test_ckan_lifecycle(test_core): diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index d92fa0be..b688b0a8 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,10 +1,10 @@ -from tests.automated.core.helpers.common_test_procedures import run_collector_and_wait_for_completion -from tests.automated.core.helpers.constants import ALLEGHENY_COUNTY_TOWN_NAMES, ALLEGHENY_COUNTY_MUCKROCK_ID import api.dependencies from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.enums import CollectorType from core.enums import BatchStatus +from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion +from test_automated.integration.core.helpers.constants import ALLEGHENY_COUNTY_MUCKROCK_ID, ALLEGHENY_COUNTY_TOWN_NAMES def test_muckrock_simple_search_collector_lifecycle(test_core): diff --git a/tests/manual/source_collectors/test_muckrock_collectors.py b/tests/manual/source_collectors/test_muckrock_collectors.py index 00e1d57e..4689dbab 100644 --- a/tests/manual/source_collectors/test_muckrock_collectors.py +++ b/tests/manual/source_collectors/test_muckrock_collectors.py @@ -7,7 +7,7 @@ from source_collectors.muckrock.classes.MuckrockCollector import MuckrockSimpleSearchCollector, \ MuckrockCountyLevelSearchCollector, MuckrockAllFOIARequestsCollector from source_collectors.muckrock.schemas import MuckrockURLInfoSchema -from test_automated.integration.core.helpers import ALLEGHENY_COUNTY_MUCKROCK_ID, ALLEGHENY_COUNTY_TOWN_NAMES +from test_automated.integration.core.helpers.constants import ALLEGHENY_COUNTY_MUCKROCK_ID, ALLEGHENY_COUNTY_TOWN_NAMES def test_muckrock_simple_search_collector(): diff --git a/tests/test_alembic/helpers.py b/tests/test_alembic/helpers.py index 098ee1df..d6b2bea4 100644 --- a/tests/test_alembic/helpers.py +++ 
b/tests/test_alembic/helpers.py @@ -1,7 +1,9 @@ +from typing import Optional + from sqlalchemy import text from sqlalchemy.orm import Session -from tests.test_alembic.AlembicRunner import AlembicRunner +from tests.test_alembic.AlembicRunner import AlembicRunner def get_enum_values(enum_name: str, session: Session) -> list[str]: @@ -10,13 +12,23 @@ def get_enum_values(enum_name: str, session: Session) -> list[str]: def table_creation_check( alembic_runner: AlembicRunner, tables: list[str], - start_revision: str, - end_revision: str + end_revision: str, + start_revision: Optional[str] = None, + ): - alembic_runner.upgrade(start_revision) + if start_revision is not None: + alembic_runner.upgrade(start_revision) for table_name in tables: assert table_name not in alembic_runner.inspector.get_table_names() alembic_runner.upgrade(end_revision) alembic_runner.reflect() for table_name in tables: - assert table_name in alembic_runner.inspector.get_table_names() \ No newline at end of file + assert table_name in alembic_runner.inspector.get_table_names() + +def columns_in_table( + alembic_runner: AlembicRunner, + table_name: str, + columns_to_check: list[str], +) -> bool: + current_columns = [col["name"] for col in alembic_runner.inspector.get_columns(table_name)] + return all(column in current_columns for column in columns_to_check) diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py index 95684ce2..67869ab6 100644 --- a/tests/test_alembic/test_revisions.py +++ b/tests/test_alembic/test_revisions.py @@ -15,6 +15,7 @@ from sqlalchemy import text +from test_alembic.helpers import columns_in_table from tests.test_alembic.helpers import get_enum_values, table_creation_check @@ -304,6 +305,12 @@ def test_create_metadata_annotation_table(alembic_runner): ) def test_add_task_tables_and_linking_logic(alembic_runner): + alembic_runner.upgrade("dcd158092de0") + assert not columns_in_table( + alembic_runner, + table_name="url_error_info", + columns_to_check=["task_id"], + ) table_creation_check( alembic_runner, tables=[ @@ -311,6 +318,10 @@ def test_add_task_tables_and_linking_logic(alembic_runner): "task_errors", "link_task_urls" ], - start_revision="dcd158092de0", - end_revision="b0e34cec732a" + end_revision="072b32a45b1c" + ) + assert columns_in_table( + alembic_runner, + table_name="url_error_info", + columns_to_check=["task_id"], ) \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 7a0e9a6a..220b6645 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -5,15 +5,17 @@ from starlette.testclient import TestClient from collector_db.DTOs.BatchInfo import BatchInfo +from collector_db.DTOs.TaskInfo import TaskInfo +from collector_db.enums import TaskType from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse +from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import 
GetURLsResponseInfo -from core.DTOs.LabelStudioExportResponseInfo import LabelStudioExportResponseInfo from core.DTOs.MessageCountResponse import MessageCountResponse from core.DTOs.MessageResponse import MessageResponse from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo @@ -160,12 +162,6 @@ def get_batch_logs(self, batch_id: int) -> GetBatchLogsResponse: ) return GetBatchLogsResponse(**data) - def export_batch_to_label_studio(self, batch_id: int) -> LabelStudioExportResponseInfo: - data = self.post( - url=f"/label-studio/export-batch/{batch_id}" - ) - return LabelStudioExportResponseInfo(**data) - def abort_batch(self, batch_id: int) -> MessageResponse: data = self.post( url=f"/batch/{batch_id}/abort" @@ -201,4 +197,30 @@ def get_urls(self, page: int = 1, errors: bool = False) -> GetURLsResponseInfo: url=f"/url", params={"page": page, "errors": errors} ) - return GetURLsResponseInfo(**data) \ No newline at end of file + return GetURLsResponseInfo(**data) + + def get_task_info(self, task_id: int) -> TaskInfo: + data = self.get( + url=f"/task/{task_id}" + ) + return TaskInfo(**data) + + def get_tasks( + self, + page: int = 1, + task_type: Optional[TaskType] = None, + task_status: Optional[BatchStatus] = None + ) -> GetTasksResponse: + params = {"page": page} + update_if_not_none( + target=params, + source={ + "task_type": task_type.value if task_type else None, + "task_status": task_status.value if task_status else None + } + ) + data = self.get( + url=f"/task", + params=params + ) + return GetTasksResponse(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_task.py b/tests/test_automated/integration/api/test_task.py new file mode 100644 index 00000000..64fbe75d --- /dev/null +++ b/tests/test_automated/integration/api/test_task.py @@ -0,0 +1,41 @@ +import pytest + +from collector_db.enums import TaskType +from test_automated.integration.api.conftest import APITestHelper + + +async def task_setup(ath: APITestHelper) -> int: + iui = ath.db_data_creator.urls(batch_id=ath.db_data_creator.batch(), url_count=3) + url_ids = [url.url_id for url in iui.url_mappings] + + task_id = await ath.db_data_creator.task(url_ids=url_ids) + await ath.db_data_creator.error_info(url_ids=[url_ids[0]], task_id=task_id) + + return task_id + +@pytest.mark.asyncio +async def test_get_task_info(api_test_helper): + ath = api_test_helper + + task_id = await task_setup(ath) + + task_info = ath.request_validator.get_task_info(task_id=task_id) + + assert len(task_info.urls) == 3 + assert len(task_info.url_errors) == 1 + + assert task_info.task_type == TaskType.HTML + +@pytest.mark.asyncio +async def test_get_tasks(api_test_helper): + ath = api_test_helper + for i in range(2): + await task_setup(ath) + + response = ath.request_validator.get_tasks(page=1, task_type=None, task_status=None) + + assert len(response.tasks) == 2 + for task in response.tasks: + assert task.type == TaskType.HTML + assert task.url_count == 3 + assert task.url_error_count == 1 diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py index 51d9a918..272f3de2 100644 --- a/tests/test_automated/integration/collector_db/test_database_structure.py +++ b/tests/test_automated/integration/collector_db/test_database_structure.py @@ -326,10 +326,3 @@ def test_root_url(db_data_creator: DBDataCreator): ) table_tester.run_column_tests() - -def test_task_url_links(db_data_creator: DBDataCreator): - # 
Create URLs - - # Create task - - # Associate URLs with task diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index feadf57f..fa3b7110 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -136,12 +136,14 @@ async def test_add_url_error_info(db_data_creator: DBDataCreator): url_ids = [url_mapping.url_id for url_mapping in url_mappings] adb_client = AsyncDatabaseClient() + task_id = await db_data_creator.task() error_infos = [] for url_mapping in url_mappings: uei = URLErrorPydanticInfo( url_id=url_mapping.url_id, error="test error", + task_id=task_id ) error_infos.append(uei) @@ -167,7 +169,9 @@ async def test_get_urls_with_html_data_and_no_relevancy_metadata( url_ids = [url_info.url_id for url_info in url_mappings] await db_data_creator.html_data(url_ids) await db_data_creator.metadata([url_ids[0]]) - results = await db_data_creator.adb_client.get_urls_with_html_data_and_no_relevancy_metadata() + results = await db_data_creator.adb_client.get_urls_with_html_data_and_without_metadata_type( + without_metadata_type=URLMetadataAttributeType.RELEVANT + ) permitted_url_ids = [url_id for url_id in url_ids if url_id != url_ids[0]] assert len(results) == 2 diff --git a/tests/test_automated/integration/tasks/__init__.py b/tests/test_automated/integration/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_automated/integration/tasks/test_example_task.py b/tests/test_automated/integration/tasks/test_example_task.py new file mode 100644 index 00000000..6e69bc89 --- /dev/null +++ b/tests/test_automated/integration/tasks/test_example_task.py @@ -0,0 +1,87 @@ +import types + +import pytest + +from collector_db.enums import TaskType +from core.classes.TaskOperatorBase import TaskOperatorBase +from core.enums import BatchStatus +from helpers.DBDataCreator import DBDataCreator + +class ExampleTaskOperator(TaskOperatorBase): + + @property + def task_type(self) -> TaskType: + # Use TaskType.HTML so we don't have to add a test enum value to the db + return TaskType.HTML + + async def inner_task_logic(self): + raise NotImplementedError + + async def meets_task_prerequisites(self): + return True + +@pytest.mark.asyncio +async def test_example_task_success(db_data_creator: DBDataCreator): + batch_id = db_data_creator.batch() + url_mappings = db_data_creator.urls( + batch_id=batch_id, + url_count=3 + ).url_mappings + url_ids = [url_info.url_id for url_info in url_mappings] + + async def mock_inner_task_logic(self): + # Add link to 3 urls + await self.adb_client.link_urls_to_task(task_id=self.task_id, url_ids=url_ids) + self.tasks_linked = True + + operator = ExampleTaskOperator(adb_client=db_data_creator.adb_client) + operator.inner_task_logic = types.MethodType(mock_inner_task_logic, operator) + + await operator.run_task() + + # Get Task Info + task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id) + + # Check that 3 urls were linked to the task + assert len(task_info.urls) == 3 + + # Check that error info is empty + assert task_info.error_info is None + + # Check that the task was marked as complete + assert task_info.task_status == BatchStatus.COMPLETE + + # Check that the task type is HTML + assert task_info.task_type == TaskType.HTML + + + # Check that updated_at is not null + assert task_info.updated_at is not None + +@pytest.mark.asyncio +async def 
test_example_task_failure(db_data_creator: DBDataCreator): + operator = ExampleTaskOperator(adb_client=db_data_creator.adb_client) + + async def mock_inner_task_logic(self): + raise ValueError("test error") + + operator.inner_task_logic = types.MethodType(mock_inner_task_logic, operator) + await operator.run_task() + + # Get Task Info + task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id) + + # Check that there are no URLs associated + assert len(task_info.urls) == 0 + + # Check that the task was marked as errored + assert task_info.task_status == BatchStatus.ERROR + + # Check that the task type is HTML + assert task_info.task_type == TaskType.HTML + + # Check error + assert "test error" in task_info.error_info + + + diff --git a/tests/test_automated/integration/tasks/test_url_html_task.py b/tests/test_automated/integration/tasks/test_url_html_task.py new file mode 100644 index 00000000..ff608b66 --- /dev/null +++ b/tests/test_automated/integration/tasks/test_url_html_task.py @@ -0,0 +1,105 @@ +import types +from typing import Optional + +import pytest + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.enums import TaskType +from collector_db.models import Task +from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.enums import BatchStatus +from helpers.DBDataCreator import DBDataCreator +from helpers.assert_functions import assert_database_has_no_tasks +from html_tag_collector.DataClassTags import ResponseHTMLInfo +from html_tag_collector.ResponseParser import HTMLResponseParser +from html_tag_collector.RootURLCache import RootURLCache +from html_tag_collector.URLRequestInterface import URLRequestInterface, URLResponseInfo + + +@pytest.mark.asyncio +async def test_url_html_task(db_data_creator: DBDataCreator): + + mock_html_content = "" + mock_content_type = "text/html" + + async def mock_make_requests(self, urls: list[str]) -> list[URLResponseInfo]: + results = [] + for idx, url in enumerate(urls): + if idx == 2: + results.append( + URLResponseInfo( + success=False, + exception=ValueError("test error"), + content_type=mock_content_type + )) + else: + results.append(URLResponseInfo( + html=mock_html_content, success=True, content_type=mock_content_type)) + return results + + async def mock_parse(self, url: str, html_content: str, content_type: str) -> ResponseHTMLInfo: + assert html_content == mock_html_content + assert content_type == mock_content_type + return ResponseHTMLInfo( + url=url, + title="fake title", + description="fake description", + ) + + async def mock_get_from_cache(self, url: str) -> Optional[str]: + return None + + # Add mock methods or mock classes + url_request_interface = URLRequestInterface() + url_request_interface.make_requests = types.MethodType(mock_make_requests, url_request_interface) + + mock_root_url_cache = RootURLCache() + mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache) + + html_parser = HTMLResponseParser( + root_url_cache=mock_root_url_cache + ) + html_parser.parse = types.MethodType(mock_parse, html_parser) + + operator = URLHTMLTaskOperator( + adb_client=AsyncDatabaseClient(), + url_request_interface=url_request_interface, + html_parser=html_parser + ) + await operator.run_task() + + # Check that, because no URLs were created, the task did not run + await assert_database_has_no_tasks(db_data_creator.adb_client) + + batch_id = db_data_creator.batch() + url_mappings = db_data_creator.urls(batch_id=batch_id, 
url_count=3).url_mappings + url_ids = [url_info.url_id for url_info in url_mappings] + + await operator.run_task() + + + # Check in database that + # - task is listed as complete + # - task type is listed as 'HTML' + # - task has 3 urls + # - task has one errored url with error "ValueError" + task_info = await db_data_creator.adb_client.get_task_info( + task_id=operator.task_id + ) + + assert task_info.error_info is None + assert task_info.task_status == BatchStatus.COMPLETE + assert task_info.task_type == TaskType.HTML + + assert len(task_info.urls) == 3 + assert len(task_info.url_errors) == 1 + assert task_info.url_errors[0].error == "test error" + + adb = db_data_creator.adb_client + # Check that both success urls have two rows of HTML data + hci = await adb.get_html_content_info(url_id=task_info.urls[0].id) + assert len(hci) == 2 + hci = await adb.get_html_content_info(url_id=task_info.urls[1].id) + assert len(hci) == 2 + + # Check that errored url has error info diff --git a/tests/test_automated/integration/tasks/test_url_record_type_task.py b/tests/test_automated/integration/tasks/test_url_record_type_task.py new file mode 100644 index 00000000..349a4e23 --- /dev/null +++ b/tests/test_automated/integration/tasks/test_url_record_type_task.py @@ -0,0 +1,47 @@ +from unittest.mock import MagicMock + +import pytest + +from collector_db.enums import TaskType +from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator +from core.enums import RecordType, BatchStatus +from helpers.assert_functions import assert_database_has_no_tasks +from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier + + +@pytest.mark.asyncio +async def test_url_record_type_task(db_data_creator): + + mock_classifier = MagicMock(spec=DeepSeekRecordClassifier) + mock_classifier.classify_url.side_effect = [RecordType.ACCIDENT_REPORTS, "Error"] + + operator = URLRecordTypeTaskOperator( + adb_client=db_data_creator.adb_client, + classifier=mock_classifier + ) + + await operator.run_task() + + # No task should have been created due to not meeting prerequisites + await assert_database_has_no_tasks(db_data_creator.adb_client) + + batch_id = db_data_creator.batch() + iui = db_data_creator.urls(batch_id=batch_id, url_count=2) + url_ids = [iui.url_mappings[0].url_id, iui.url_mappings[1].url_id] + await db_data_creator.html_data(url_ids) + + await operator.run_task() + + + # Task should have been created + task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id) + assert task_info.error_info is None + assert task_info.task_status == BatchStatus.COMPLETE + + response = await db_data_creator.adb_client.get_tasks() + tasks = response.tasks + assert len(tasks) == 1 + task = tasks[0] + assert task.type == TaskType.RECORD_TYPE + assert task.url_count == 2 + assert task.url_error_count == 1 diff --git a/tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py similarity index 82% rename from tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py rename to tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py index 3ff2c846..1cab4ee5 100644 --- a/tests/test_automated/integration/cycles/test_url_relevancy_huggingface_cycle.py +++ b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py @@ -5,18 +5,15 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLWithHTML 
import URLWithHTML from collector_db.enums import ValidationStatus, ValidationSource -from collector_db.models import URLMetadata +from collector_db.models import URLMetadata, Task from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator +from helpers.assert_functions import assert_database_has_no_tasks from hugging_face.HuggingFaceInterface import HuggingFaceInterface @pytest.mark.asyncio -async def test_url_relevancy_huggingface_cycle(db_data_creator): - batch_id = db_data_creator.batch() - url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings - url_ids = [url_info.url_id for url_info in url_mappings] - await db_data_creator.html_data(url_ids) - await db_data_creator.metadata([url_ids[0]]) +async def test_url_relevancy_huggingface_task(db_data_creator): + def num_to_bool(num: int) -> bool: if num == 0: @@ -38,11 +35,21 @@ def mock_get_url_relevancy( mock_hf_interface = MagicMock(spec=HuggingFaceInterface) mock_hf_interface.get_url_relevancy = mock_get_url_relevancy - cycler = URLRelevanceHuggingfaceTaskOperator( + task_operator = URLRelevanceHuggingfaceTaskOperator( adb_client=AsyncDatabaseClient(), huggingface_interface=mock_hf_interface ) - await cycler.run_task() + await task_operator.run_task() + + await assert_database_has_no_tasks(db_data_creator.adb_client) + + batch_id = db_data_creator.batch() + url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings + url_ids = [url_info.url_id for url_info in url_mappings] + await db_data_creator.html_data(url_ids) + await db_data_creator.metadata([url_ids[0]]) + + await task_operator.run_task() results = await db_data_creator.adb_client.get_all(URLMetadata) From 698902e9d20ea34b31d80635a09846399422018c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 28 Jan 2025 18:34:00 -0500 Subject: [PATCH 019/182] Create `/task` route --- api/routes/task.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 api/routes/task.py diff --git a/api/routes/task.py b/api/routes/task.py new file mode 100644 index 00000000..d9cdbeac --- /dev/null +++ b/api/routes/task.py @@ -0,0 +1,49 @@ +from typing import Optional + +from fastapi import APIRouter, Depends, Query, Path + +from api.dependencies import get_async_core +from collector_db.DTOs.TaskInfo import TaskInfo +from collector_db.enums import TaskType +from core.AsyncCore import AsyncCore +from core.enums import BatchStatus +from security_manager.SecurityManager import AccessInfo, get_access_info + +task_router = APIRouter( + prefix="/task", + tags=["Task"], + responses={404: {"description": "Not found"}}, +) + + +@task_router.get("") +async def get_tasks( + page: int = Query( + description="The page number", + default=1 + ), + task_status: Optional[BatchStatus] = Query( + description="Filter by task status", + default=None + ), + task_type: Optional[TaskType] = Query( + description="Filter by task type", + default=None + ), + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +): + return await async_core.get_tasks( + page=page, + task_type=task_type, + task_status=task_status + ) + + +@task_router.get("/{task_id}") +async def get_task_info( + task_id: int = Path(description="The task id"), + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> TaskInfo: + return await async_core.get_task_info(task_id) \ No newline at end of file From 
fbc9bcb5a606442338e26a98a86bbc4c4c7519c0 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 28 Jan 2025 18:34:09 -0500 Subject: [PATCH 020/182] Rename directory --- .../cycles => core/DTOs/task_data_objects}/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {tests/test_automated/integration/cycles => core/DTOs/task_data_objects}/__init__.py (100%) diff --git a/tests/test_automated/integration/cycles/__init__.py b/core/DTOs/task_data_objects/__init__.py similarity index 100% rename from tests/test_automated/integration/cycles/__init__.py rename to core/DTOs/task_data_objects/__init__.py From b90a7bdac8614e1276437ea6a97596bbf012f0fb Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 28 Jan 2025 18:34:24 -0500 Subject: [PATCH 021/182] Change name of url_error_info foreign key constraint --- .../072b32a45b1c_add_task_tables_and_linking_logic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py b/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py index dcae164b..b6670ff1 100644 --- a/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py +++ b/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py @@ -63,12 +63,12 @@ def upgrade() -> None: op.add_column('url_error_info', sa.Column('task_id', sa.Integer(), nullable=False)) op.create_unique_constraint('uq_url_id_error', 'url_error_info', ['url_id', 'task_id']) - op.create_foreign_key("fk_url_error_info_task", 'url_error_info', 'tasks', ['task_id'], ['id']) + op.create_foreign_key("url_error_info_task_id_fkey", 'url_error_info', 'tasks', ['task_id'], ['id']) def downgrade() -> None: - op.drop_constraint("fk_url_error_info_task", 'url_error_info', type_='foreignkey') + op.drop_constraint("url_error_info_task_id_fkey", 'url_error_info', type_='foreignkey') op.drop_constraint('uq_url_id_error', 'url_error_info', type_='unique') op.drop_column('url_error_info', 'task_id') op.drop_table('link_task_urls') From 2cecaf52a241cc2afad1c0897cd5d85ea83f271e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 29 Jan 2025 12:15:37 -0500 Subject: [PATCH 022/182] Add logic for automatically assigning values to record types via OpenAI. 
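This patch wires structured-output LLM classification into the record-type task: the operator links pending URLs to the task, hands each URL's stored HTML content to the classifier, and saves the returned record type as pending-validation metadata, with the model name recorded in the new `notes` column. A minimal usage sketch of the classifier on its own (illustrative only — `load_content_infos_for_url` is a hypothetical stand-in for the database query, and OPENAI_API_KEY must be set in the environment):

    import asyncio

    from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier

    async def main() -> None:
        classifier = OpenAIRecordClassifier()
        # Hypothetical helper: load the URLHTMLContentInfo rows (title,
        # description, etc.) that the HTML task stored for one URL.
        content_infos = await load_content_infos_for_url(url_id=1)
        # Sends the record-type prompt plus the dictified HTML content to the
        # model and parses the reply into RecordTypeStructuredOutput.
        record_type = await classifier.classify_url(content_infos)
        print(record_type)  # e.g. "Accident Reports"

    asyncio.run(main())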
--- Dockerfile | 3 +- collector_db/DTOs/URLMetadataInfo.py | 1 + ...45b1c_add_task_tables_and_linking_logic.py | 2 + collector_db/models.py | 2 + core/AsyncCore.py | 4 +- core/ScheduledTaskManager.py | 3 +- core/classes/TaskOperatorBase.py | 2 +- core/classes/URLRecordTypeTaskOperator.py | 10 +- llm_api_logic/DeepSeekRecordClassifier.py | 103 ++++-------------- llm_api_logic/LLMRecordClassifierBase.py | 76 +++++++++++++ llm_api_logic/OpenAIRecordClassifier.py | 34 ++++++ llm_api_logic/RecordTypeStructuredOutput.py | 13 +++ llm_api_logic/constants.py | 48 ++++++++ llm_api_logic/helpers.py | 8 ++ local_database/DataDumper/docker-compose.yml | 4 +- local_database/DataDumper/dump.sh | 4 - local_database/DataDumper/restore.sh | 10 +- tests/helpers/DBDataCreator.py | 2 +- .../test_openai_record_classifier.py | 26 +++++ tests/test_alembic/test_revisions.py | 10 ++ .../tasks/test_url_record_type_task.py | 15 ++- 21 files changed, 266 insertions(+), 114 deletions(-) create mode 100644 llm_api_logic/LLMRecordClassifierBase.py create mode 100644 llm_api_logic/OpenAIRecordClassifier.py create mode 100644 llm_api_logic/RecordTypeStructuredOutput.py create mode 100644 llm_api_logic/constants.py create mode 100644 llm_api_logic/helpers.py create mode 100644 tests/manual/llm_api_logic/test_openai_record_classifier.py diff --git a/Dockerfile b/Dockerfile index 86bd21b1..e59b96f2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,6 +29,7 @@ COPY apply_migrations.py ./apply_migrations.py COPY security_manager ./security_manager COPY execute.sh ./execute.sh COPY .project-root ./.project-root +COPY llm_api_logic ./llm_api_logic # Expose the application port EXPOSE 80 @@ -36,4 +37,4 @@ EXPOSE 80 RUN chmod +x execute.sh # Use the below for ease of local development, but remove when pushing to GitHub # Because there is no .env file in the repository (for security reasons) -#COPY .env ./.env +COPY .env ./.env diff --git a/collector_db/DTOs/URLMetadataInfo.py b/collector_db/DTOs/URLMetadataInfo.py index 9cbc7dca..461d16e9 100644 --- a/collector_db/DTOs/URLMetadataInfo.py +++ b/collector_db/DTOs/URLMetadataInfo.py @@ -12,6 +12,7 @@ class URLMetadataInfo(BaseModel): attribute: Optional[URLMetadataAttributeType] = None # TODO: May need to add validation here depending on the type of attribute value: Optional[str] = None + notes: Optional[str] = None validation_status: Optional[ValidationStatus] = None validation_source: Optional[ValidationSource] = None created_at: Optional[datetime] = None diff --git a/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py b/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py index b6670ff1..b2174484 100644 --- a/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py +++ b/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py @@ -62,6 +62,7 @@ def upgrade() -> None: op.execute("DELETE FROM url_error_info;") op.add_column('url_error_info', sa.Column('task_id', sa.Integer(), nullable=False)) + op.add_column("url_metadata", sa.Column('notes', sa.Text(), nullable=True)) op.create_unique_constraint('uq_url_id_error', 'url_error_info', ['url_id', 'task_id']) op.create_foreign_key("url_error_info_task_id_fkey", 'url_error_info', 'tasks', ['task_id'], ['id']) @@ -71,6 +72,7 @@ def downgrade() -> None: op.drop_constraint("url_error_info_task_id_fkey", 'url_error_info', type_='foreignkey') op.drop_constraint('uq_url_id_error', 'url_error_info', type_='unique') op.drop_column('url_error_info', 
'task_id') + op.drop_column('url_metadata', 'notes') op.drop_table('link_task_urls') op.drop_table('task_errors') op.drop_table('tasks') diff --git a/collector_db/models.py b/collector_db/models.py index f1eac526..aa33d41e 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -114,6 +114,8 @@ class URLMetadata(Base): PGEnum('Machine Learning', 'Label Studio', 'Manual', name='validation_source'), nullable=False ) + notes = Column(Text, nullable=True) + # Timestamps created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 5318a044..afa5c7ab 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -17,7 +17,7 @@ from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface -from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier +from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier class AsyncCore: @@ -57,7 +57,7 @@ async def run_url_record_type_task(self): self.logger.info("Running URL Record Type Task") operator = URLRecordTypeTaskOperator( adb_client=self.adb_client, - classifier=DeepSeekRecordClassifier() + classifier=OpenAIRecordClassifier() ) await operator.run_task() diff --git a/core/ScheduledTaskManager.py b/core/ScheduledTaskManager.py index e061adee..5b2ff0a7 100644 --- a/core/ScheduledTaskManager.py +++ b/core/ScheduledTaskManager.py @@ -56,7 +56,8 @@ def add_scheduled_tasks(self): trigger=IntervalTrigger( hours=1, start_date=datetime.now() + timedelta(minutes=1) - ) + ), + misfire_grace_time=60 ) def shutdown(self): diff --git a/core/classes/TaskOperatorBase.py b/core/classes/TaskOperatorBase.py index 6fd86c97..7998713c 100644 --- a/core/classes/TaskOperatorBase.py +++ b/core/classes/TaskOperatorBase.py @@ -1,5 +1,5 @@ -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.enums import TaskType from core.enums import BatchStatus diff --git a/core/classes/URLRecordTypeTaskOperator.py b/core/classes/URLRecordTypeTaskOperator.py index 18ac8ef4..6287bcae 100644 --- a/core/classes/URLRecordTypeTaskOperator.py +++ b/core/classes/URLRecordTypeTaskOperator.py @@ -5,7 +5,7 @@ from core.DTOs.task_data_objects.URLRecordTypeTDO import URLRecordTypeTDO from core.classes.TaskOperatorBase import TaskOperatorBase from core.enums import RecordType -from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier +from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier class URLRecordTypeTaskOperator(TaskOperatorBase): @@ -13,7 +13,7 @@ class URLRecordTypeTaskOperator(TaskOperatorBase): def __init__( self, adb_client: AsyncDatabaseClient, - classifier: DeepSeekRecordClassifier + classifier: OpenAIRecordClassifier ): super().__init__(adb_client) self.classifier = classifier @@ -63,14 +63,14 @@ async def put_results_into_database(self, tdos: list[URLRecordTypeTDO]): url_metadata = URLMetadataInfo( url_id=tdo.url_with_html.url_id, attribute=URLMetadataAttributeType.RECORD_TYPE, - value=str(tdo.record_type), + value=str(tdo.record_type.value), validation_status=ValidationStatus.PENDING_VALIDATION, - validation_source=ValidationSource.MACHINE_LEARNING + validation_source=ValidationSource.MACHINE_LEARNING, + notes=self.classifier.model_name ) url_metadatas.append(url_metadata) await 
self.adb_client.add_url_metadatas(url_metadatas) - async def separate_success_and_error_subsets(self, tdos: list[URLRecordTypeTDO]): success_subset = [tdo for tdo in tdos if not tdo.is_errored()] error_subset = [tdo for tdo in tdos if tdo.is_errored()] diff --git a/llm_api_logic/DeepSeekRecordClassifier.py b/llm_api_logic/DeepSeekRecordClassifier.py index 5a2067e0..67f6fa09 100644 --- a/llm_api_logic/DeepSeekRecordClassifier.py +++ b/llm_api_logic/DeepSeekRecordClassifier.py @@ -5,94 +5,29 @@ from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo from core.enums import RecordType +from llm_api_logic.LLMRecordClassifierBase import RecordClassifierBase -QUERY_CONTENT = """ - You will be provided with structured data from a web page and determine - the record type. - - The record types are as follows +class DeepSeekRecordClassifier(RecordClassifierBase): - "Accident Reports": Records of vehicle accidents. - "Arrest Records": Records of each arrest made in the agency's jurisdiction. - "Calls for Service": Records of officers initiating activity or responding to requests for police response. Often called "Dispatch Logs" or "Incident Reports" when published. - "Car GPS": Records of police car location. Not generally posted online. - "Citations": Records of low-level criminal offenses where a police officer issued a citation instead of an arrest. - "Dispatch Logs": Records of calls or orders made by police dispatchers. - "Dispatch Recordings": Audio feeds and/or archives of municipal dispatch channels. - "Field Contacts": Reports of contact between police and civilians. May include uses of force, incidents, arrests, or contacts where nothing notable happened. - "Incident Reports": Reports made by police officers after responding to a call which may or may not be criminal in nature. Not generally posted online. - "Misc Police Activity": Records or descriptions of police activity not covered by other record types. - "Officer Involved Shootings": Case files of gun violence where a police officer was involved, typically as the shooter. Detailed, often containing references to records like Media Bulletins and Use of Force Reports. - "Stops": Records of pedestrian or traffic stops made by police. - "Surveys": Information captured from a sample of some population, like incarcerated people or magistrate judges. Often generated independently. - "Use of Force Reports": Records of use of force against civilians by police officers. - "Vehicle Pursuits": Records of cases where police pursued a person fleeing in a vehicle. - "Complaints & Misconduct": Records, statistics, or summaries of complaints and misconduct investigations into law enforcement officers. - "Daily Activity Logs": Officer-created reports or time sheets of what happened on a shift. Not generally posted online. - "Training & Hiring Info": Records and descriptions of additional training for police officers. - "Personnel Records": Records of hiring and firing, certification, discipline, and other officer-specific events. Not generally posted online. - "Annual & Monthly Reports": Often in PDF form, featuring summaries or high-level updates about the police force. Can contain versions of other record types, especially summaries. - "Budgets & Finances": Budgets, finances, grants, or other financial documents. - "Contact Info & Agency Meta": Information about organizational structure, including department structure and contact info. 
diff --git a/llm_api_logic/DeepSeekRecordClassifier.py b/llm_api_logic/DeepSeekRecordClassifier.py
index 5a2067e0..67f6fa09 100644
--- a/llm_api_logic/DeepSeekRecordClassifier.py
+++ b/llm_api_logic/DeepSeekRecordClassifier.py
@@ -5,94 +5,29 @@
 from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo
 from core.enums import RecordType
+from llm_api_logic.LLMRecordClassifierBase import RecordClassifierBase

-QUERY_CONTENT = """
-    You will be provided with structured data from a web page and determine
-    the record type.
-
-    The record types are as follows
+class DeepSeekRecordClassifier(RecordClassifierBase):

-    "Accident Reports": Records of vehicle accidents.
-    "Arrest Records": Records of each arrest made in the agency's jurisdiction.
-    "Calls for Service": Records of officers initiating activity or responding to requests for police response. Often called "Dispatch Logs" or "Incident Reports" when published.
-    "Car GPS": Records of police car location. Not generally posted online.
-    "Citations": Records of low-level criminal offenses where a police officer issued a citation instead of an arrest.
-    "Dispatch Logs": Records of calls or orders made by police dispatchers.
-    "Dispatch Recordings": Audio feeds and/or archives of municipal dispatch channels.
-    "Field Contacts": Reports of contact between police and civilians. May include uses of force, incidents, arrests, or contacts where nothing notable happened.
-    "Incident Reports": Reports made by police officers after responding to a call which may or may not be criminal in nature. Not generally posted online.
-    "Misc Police Activity": Records or descriptions of police activity not covered by other record types.
-    "Officer Involved Shootings": Case files of gun violence where a police officer was involved, typically as the shooter. Detailed, often containing references to records like Media Bulletins and Use of Force Reports.
-    "Stops": Records of pedestrian or traffic stops made by police.
-    "Surveys": Information captured from a sample of some population, like incarcerated people or magistrate judges. Often generated independently.
-    "Use of Force Reports": Records of use of force against civilians by police officers.
-    "Vehicle Pursuits": Records of cases where police pursued a person fleeing in a vehicle.
-    "Complaints & Misconduct": Records, statistics, or summaries of complaints and misconduct investigations into law enforcement officers.
-    "Daily Activity Logs": Officer-created reports or time sheets of what happened on a shift. Not generally posted online.
-    "Training & Hiring Info": Records and descriptions of additional training for police officers.
-    "Personnel Records": Records of hiring and firing, certification, discipline, and other officer-specific events. Not generally posted online.
-    "Annual & Monthly Reports": Often in PDF form, featuring summaries or high-level updates about the police force. Can contain versions of other record types, especially summaries.
-    "Budgets & Finances": Budgets, finances, grants, or other financial documents.
-    "Contact Info & Agency Meta": Information about organizational structure, including department structure and contact info.
-    "Geographic": Maps or geographic data about how land is divided up into municipal sectors, zones, and jurisdictions.
-    "List of Data Sources": Places on the internet, often data portal homepages, where many links to potential data sources can be found.
-    "Policies & Contracts": Policies or contracts related to agency procedure.
-    "Crime Maps & Reports": Records of individual crimes in map or table form for a given jurisdiction.
-    "Crime Statistics": Summarized information about crime in a given jurisdiction.
-    "Media Bulletins": Press releases, blotters, or blogs intended to broadly communicate alerts, requests, or other timely information.
-    "Records Request Info": Portals, forms, policies, or other resources for making public records requests.
-    "Resources": Agency-provided information or guidance about services, prices, best practices, etc.
-    "Sex Offender Registry": Index of people registered, usually by law, with the government as sex offenders.
-    "Wanted Persons": Names, descriptions, images, and associated information about people with outstanding arrest warrants.
-    "Booking Reports": Records of booking or intake into corrections institutions.
-    "Court Cases": Records such as dockets about individual court cases.
-    "Incarceration Records": Records of current inmates, often with full names and features for notification upon inmate release.
-    "Other": Other record types not otherwise described.
-
-    Output the record type in the following format. Do not include any other information:
+    @property
+    def api_key(self):
+        return os.getenv("DEEPSEEK_API_KEY")

-    {
-        "record_type": ""
-    }
-    """
+    @property
+    def model_name(self):
+        return "deepseek-chat"

-def dictify_html_info(html_infos: list[URLHTMLContentInfo]) -> dict[str, str]:
-    d = {}
-    for html_info in html_infos:
-        d[html_info.content_type.value] = html_info.content
-    return d
+    @property
+    def base_url(self):
+        return "https://api.deepseek.com"

-class DeepSeekRecordClassifier:
+    @property
+    def response_format(self):
+        return {
+            'type': 'json_object'
+        }

-    def __init__(self):
-        self.client = AsyncOpenAI(
-            api_key=os.getenv("DEEPSEEK_API_KEY"),
-            base_url="https://api.deepseek.com"
-        )
-
-    def build_query_messages(self, content_infos: list[URLHTMLContentInfo]) -> list[dict[str, str]]:
-        insert_content = dictify_html_info(content_infos)
-        return [
-            {
-                "role": "system",
-                "content": QUERY_CONTENT
-            },
-            {
-                "role": "user",
-                "content": f"```json{insert_content}```"
-            }
-        ]
-
-    async def classify_url(self, content_infos: list[URLHTMLContentInfo]) -> RecordType:
-        response = await self.client.chat.completions.create(
-            model="deepseek-chat",
-            messages=self.build_query_messages(content_infos),
-            stream=False,
-            response_format={
-                'type': 'json_object'
-            }
-        )
-        result_str = response.choices[0].message.content
-
-        result_dict = json.loads(result_str)
-        return result_dict["record_type"]
+    @property
+    def completions_func(self) -> callable:
+        # Must be the bound method on the instance's client; referencing
+        # AsyncOpenAI.chat.completions.create on the class (as originally
+        # written) fails, since .chat only exists on an instantiated client.
+        return self.client.chat.completions.create
+
+    def post_process_response(self, response) -> str:
+        # JSON mode returns a plain string body; parse it and pull the field
+        # out, as the deleted classify_url above did.
+        result_dict = json.loads(response.choices[0].message.content)
+        return result_dict["record_type"]
\ No newline at end of file
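
With this refactor a caller only deals with classify_url; a minimal usage sketch, assuming a DEEPSEEK_API_KEY in the environment and the URLHTMLContentInfo constructor used by the manual test later in this series:

# Hypothetical driver, not part of the patch: classify one page's scraped content.
import asyncio

from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType
from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier


async def main() -> None:
    infos = [URLHTMLContentInfo(content_type=HTMLContentType.TITLE, content="Daily Arrest Log")]
    classifier = DeepSeekRecordClassifier()  # requires DEEPSEEK_API_KEY
    print(await classifier.classify_url(infos))  # e.g. "Arrest Records"


asyncio.run(main())
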
diff --git a/llm_api_logic/LLMRecordClassifierBase.py b/llm_api_logic/LLMRecordClassifierBase.py
new file mode 100644
index 00000000..85142aea
--- /dev/null
+++ b/llm_api_logic/LLMRecordClassifierBase.py
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from openai import AsyncOpenAI
+
+from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo
+from llm_api_logic.RecordTypeStructuredOutput import RecordTypeStructuredOutput
+from llm_api_logic.constants import RECORD_CLASSIFICATION_QUERY_CONTENT
+from llm_api_logic.helpers import dictify_html_info
+
+
+class RecordClassifierBase(ABC):
+
+    def __init__(self):
+        self.client = AsyncOpenAI(
+            api_key=self.api_key,
+            base_url=self.base_url
+        )
+
+    @property
+    @abstractmethod
+    def api_key(self) -> str:
+        pass
+
+    @property
+    @abstractmethod
+    def model_name(self) -> str:
+        pass
+
+    @property
+    @abstractmethod
+    def base_url(self) -> str:
+        pass
+
+    @property
+    @abstractmethod
+    def response_format(self) -> dict | RecordTypeStructuredOutput:
+        pass
+
+    @property
+    @abstractmethod
+    def completions_func(self) -> callable:
+        pass
+
+    def build_query_messages(self, content_infos: list[URLHTMLContentInfo]) -> list[dict[str, str]]:
+        insert_content = dictify_html_info(content_infos)
+        return [
+            {
+                "role": "system",
+                "content": RECORD_CLASSIFICATION_QUERY_CONTENT
+            },
+            {
+                "role": "user",
+                "content": str(insert_content)
+            }
+        ]
+
+    @abstractmethod
+    def post_process_response(self, response: Any) -> str:
+        pass
+
+    async def classify_url(self, content_infos: list[URLHTMLContentInfo]) -> str:
+        func = self.completions_func
+        response = await func(
+            model=self.model_name,
+            messages=self.build_query_messages(content_infos),
+            # stream=False is needed for DeepSeek's create() call but is not
+            # accepted by OpenAI's parse() endpoint, so it is left to each
+            # subclass's completions_func.
+            response_format=self.response_format
+        )
+        return self.post_process_response(response)
\ No newline at end of file
diff --git a/llm_api_logic/OpenAIRecordClassifier.py b/llm_api_logic/OpenAIRecordClassifier.py
new file mode 100644
index 00000000..fc20a0e2
--- /dev/null
+++ b/llm_api_logic/OpenAIRecordClassifier.py
@@ -0,0 +1,34 @@
+from typing import Any
+
+from openai.types.chat import ParsedChatCompletion
+
+from llm_api_logic.LLMRecordClassifierBase import RecordClassifierBase
+from llm_api_logic.RecordTypeStructuredOutput import RecordTypeStructuredOutput
+from util.helper_functions import get_from_env
+
+
+class OpenAIRecordClassifier(RecordClassifierBase):
+
+    @property
+    def api_key(self):
+        return get_from_env("OPENAI_API_KEY")
+
+    @property
+    def model_name(self):
+        return "gpt-4o-mini-2024-07-18"
+
+    @property
+    def base_url(self):
+        return None
+
+    @property
+    def response_format(self):
+        return RecordTypeStructuredOutput
+
+    @property
+    def completions_func(self) -> callable:
+        return self.client.beta.chat.completions.parse
+
+    def post_process_response(self, response: ParsedChatCompletion) -> str:
+        output: RecordTypeStructuredOutput = response.choices[0].message.parsed
+        return output.record_type.value
\ No newline at end of file
diff --git a/llm_api_logic/RecordTypeStructuredOutput.py b/llm_api_logic/RecordTypeStructuredOutput.py
new file mode 100644
index 00000000..a5993ae9
--- /dev/null
+++ b/llm_api_logic/RecordTypeStructuredOutput.py
@@ -0,0 +1,13 @@
+"""
+Used per the guidance given in OpenAI's documentation on structured outputs:
+https://platform.openai.com/docs/guides/structured-outputs
+"""
+
+from pydantic import BaseModel
+
+from core.enums import RecordType
+
+
+class RecordTypeStructuredOutput(BaseModel):
+    record_type: RecordType
\ No newline at end of file
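
OpenAIRecordClassifier leans on openai-python's structured-outputs flow: beta.chat.completions.parse takes a Pydantic model as response_format and returns the parsed instance on message.parsed. A minimal self-contained sketch of that flow; the model name, prompt, and schema here are illustrative, not project code:

# Standalone sketch of the beta parse endpoint with a Pydantic schema.
import asyncio

from openai import AsyncOpenAI
from pydantic import BaseModel


class Label(BaseModel):  # illustrative stand-in for RecordTypeStructuredOutput
    record_type: str


async def main() -> None:
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    response = await client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[{"role": "user", "content": "Classify: 'Daily Arrest Log'"}],
        response_format=Label,  # schema is enforced server-side
    )
    print(response.choices[0].message.parsed.record_type)


asyncio.run(main())
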
diff --git a/llm_api_logic/constants.py b/llm_api_logic/constants.py
new file mode 100644
index 00000000..55133abf
--- /dev/null
+++ b/llm_api_logic/constants.py
@@ -0,0 +1,48 @@
+RECORD_CLASSIFICATION_QUERY_CONTENT = """
+    You will be provided with structured data from a web page and must
+    determine the record type.
+
+    The record types are as follows:
+
+    "Accident Reports": Records of vehicle accidents.
+    "Arrest Records": Records of each arrest made in the agency's jurisdiction.
+    "Calls for Service": Records of officers initiating activity or responding to requests for police response. Often called "Dispatch Logs" or "Incident Reports" when published.
+    "Car GPS": Records of police car location. Not generally posted online.
+    "Citations": Records of low-level criminal offenses where a police officer issued a citation instead of an arrest.
+    "Dispatch Logs": Records of calls or orders made by police dispatchers.
+    "Dispatch Recordings": Audio feeds and/or archives of municipal dispatch channels.
+    "Field Contacts": Reports of contact between police and civilians. May include uses of force, incidents, arrests, or contacts where nothing notable happened.
+    "Incident Reports": Reports made by police officers after responding to a call which may or may not be criminal in nature. Not generally posted online.
+    "Misc Police Activity": Records or descriptions of police activity not covered by other record types.
+    "Officer Involved Shootings": Case files of gun violence where a police officer was involved, typically as the shooter. Detailed, often containing references to records like Media Bulletins and Use of Force Reports.
+    "Stops": Records of pedestrian or traffic stops made by police.
+    "Surveys": Information captured from a sample of some population, like incarcerated people or magistrate judges. Often generated independently.
+    "Use of Force Reports": Records of use of force against civilians by police officers.
+    "Vehicle Pursuits": Records of cases where police pursued a person fleeing in a vehicle.
+    "Complaints & Misconduct": Records, statistics, or summaries of complaints and misconduct investigations into law enforcement officers.
+    "Daily Activity Logs": Officer-created reports or time sheets of what happened on a shift. Not generally posted online.
+    "Training & Hiring Info": Records and descriptions of additional training for police officers.
+    "Personnel Records": Records of hiring and firing, certification, discipline, and other officer-specific events. Not generally posted online.
+    "Annual & Monthly Reports": Often in PDF form, featuring summaries or high-level updates about the police force. Can contain versions of other record types, especially summaries.
+    "Budgets & Finances": Budgets, finances, grants, or other financial documents.
+    "Contact Info & Agency Meta": Information about organizational structure, including department structure and contact info.
+    "Geographic": Maps or geographic data about how land is divided up into municipal sectors, zones, and jurisdictions.
+    "List of Data Sources": Places on the internet, often data portal homepages, where many links to potential data sources can be found.
+    "Policies & Contracts": Policies or contracts related to agency procedure.
+    "Crime Maps & Reports": Records of individual crimes in map or table form for a given jurisdiction.
+    "Crime Statistics": Summarized information about crime in a given jurisdiction.
+    "Media Bulletins": Press releases, blotters, or blogs intended to broadly communicate alerts, requests, or other timely information.
+    "Records Request Info": Portals, forms, policies, or other resources for making public records requests.
+    "Resources": Agency-provided information or guidance about services, prices, best practices, etc.
+    "Sex Offender Registry": Index of people registered, usually by law, with the government as sex offenders.
+    "Wanted Persons": Names, descriptions, images, and associated information about people with outstanding arrest warrants.
+    "Booking Reports": Records of booking or intake into corrections institutions.
+    "Court Cases": Records such as dockets about individual court cases.
+    "Incarceration Records": Records of current inmates, often with full names and features for notification upon inmate release.
+    "Other": Other record types not otherwise described.
+
+    Output the record type in the following JSON format:
+    {
+        "record_type": ""
+    }
+    """
diff --git a/llm_api_logic/helpers.py b/llm_api_logic/helpers.py
new file mode 100644
index 00000000..3d5bde11
--- /dev/null
+++ b/llm_api_logic/helpers.py
@@ -0,0 +1,8 @@
+from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo
+
+
+def dictify_html_info(html_infos: list[URLHTMLContentInfo]) -> dict[str, str]:
+    d = {}
+    for html_info in html_infos:
+        d[html_info.content_type.value] = html_info.content
+    return d
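
dictify_html_info just flattens the scraped DTOs into a {content_type: content} dict that the classifiers embed in the user message; a rough illustration (the exact enum value strings are an assumption, since the HTMLContentType definition is not shown here):

# Hypothetical illustration of dictify_html_info.
from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType
from llm_api_logic.helpers import dictify_html_info

infos = [
    URLHTMLContentInfo(content_type=HTMLContentType.TITLE, content="Arrest Log"),
    URLHTMLContentInfo(content_type=HTMLContentType.DESCRIPTION, content="Daily bookings"),
]
# Keys come from content_type.value, so the exact strings depend on the enum
# definition; roughly {'title': 'Arrest Log', 'description': 'Daily bookings'}.
print(dictify_html_info(infos))
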
diff --git a/local_database/DataDumper/docker-compose.yml b/local_database/DataDumper/docker-compose.yml
index 4a28c5e8..f24c78b5 100644
--- a/local_database/DataDumper/docker-compose.yml
+++ b/local_database/DataDumper/docker-compose.yml
@@ -22,6 +22,6 @@ services:
     entrypoint: [
       "bash",
       # Comment out one of the following lines depending on your needs
-      "/usr/local/bin/dump.sh"
-#      "/usr/local/bin/restore.sh"
+#      "/usr/local/bin/dump.sh"
+      "/usr/local/bin/restore.sh"
     ]
\ No newline at end of file
diff --git a/local_database/DataDumper/dump.sh b/local_database/DataDumper/dump.sh
index 6f1954c4..fd63c65f 100644
--- a/local_database/DataDumper/dump.sh
+++ b/local_database/DataDumper/dump.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 set -e
-
 # Variables (customize these or pass them as environment variables)
 DB_HOST=${DUMP_HOST:-"postgres_container"}
 DB_USER=${DUMP_USER:-"your_user"}
@@ -8,12 +7,9 @@ DB_PORT=${DUMP_PORT:-"5432"}  # Default to 5432 if not provided
 DB_PASSWORD=${DUMP_PASSWORD:-"your_password"}
 DB_NAME=${DUMP_NAME:-"your_database"}
 DUMP_FILE=${DUMP_FILE:-"/dump/db_dump.sql"}
-
 # Export password for pg_dump
 export PGPASSWORD=$DB_PASSWORD
-
 # Dump the database
 echo "Dumping database $DB_NAME from $DB_HOST:$DB_PORT..."
 pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME --no-owner --no-acl -F c -f $DUMP_FILE
-
 echo "Dump completed. File saved to $DUMP_FILE."
\ No newline at end of file
diff --git a/local_database/DataDumper/restore.sh b/local_database/DataDumper/restore.sh
index d2046fb0..ff62349e 100644
--- a/local_database/DataDumper/restore.sh
+++ b/local_database/DataDumper/restore.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 set -e
-
 # Variables (customize these or pass them as environment variables)
 DB_HOST=${RESTORE_HOST:-"postgres_container"}
 DB_USER=${RESTORE_USER:-"your_user"}
@@ -8,15 +7,11 @@ DB_PORT=${RESTORE_PORT:-"5432"}  # Default to 5432 if not provided
 DB_PASSWORD=${RESTORE_PASSWORD:-"your_password"}
 NEW_DB_NAME=${RESTORE_DB_NAME:-"new_database"}  # Name of the database to restore into
 DUMP_FILE=${DUMP_FILE:-"/dump/db_dump.sql"}
-
 MAINTENANCE_DB="postgres"
-
 # Export password for pg_restore
 export PGPASSWORD=$DB_PASSWORD
-
 CONNECTION_STRING="postgresql://$DB_USER:$DB_PASSWORD@$DB_HOST:$DB_PORT/$NEW_DB_NAME"
 MAINT_CONNECTION_STRING="postgresql://$DB_USER:$DB_PASSWORD@$DB_HOST:$DB_PORT/$MAINTENANCE_DB"
-
 echo "Checking if database $NEW_DB_NAME exists on $DB_HOST:$DB_PORT..."
 psql -d $MAINT_CONNECTION_STRING -tc "SELECT 1 FROM pg_database WHERE datname = '$NEW_DB_NAME';" | grep -q 1 && {
     echo "Database $NEW_DB_NAME exists. Dropping it..."
@@ -25,16 +20,13 @@ psql -d $MAINT_CONNECTION_STRING -tc "SELECT 1 FROM pg_database WHERE datname =
     # Drop the database
     psql -d $MAINT_CONNECTION_STRING -c "DROP DATABASE $NEW_DB_NAME;"
 }
-
 # Create the new database
 echo "Creating new database $NEW_DB_NAME on $DB_HOST:$DB_PORT..."
 psql -d $MAINT_CONNECTION_STRING -c "CREATE DATABASE $NEW_DB_NAME;" || {
     echo "Failed to create database $NEW_DB_NAME. It might already exist."
     exit 1
 }
-
 # Restore the dump into the new database
 echo "Restoring dump from $DUMP_FILE into database $NEW_DB_NAME..."
 pg_restore -d $CONNECTION_STRING --no-owner --no-acl -F c $DUMP_FILE
-
-echo "Database restoration completed."
+echo "Database restoration completed."
\ No newline at end of file
diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py
index c9c26846..0041fad5 100644
--- a/tests/helpers/DBDataCreator.py
+++ b/tests/helpers/DBDataCreator.py
@@ -21,7 +21,7 @@ class DBDataCreator:
     """
     def __init__(self, db_client: DatabaseClient = DatabaseClient()):
         self.db_client = db_client
-        self.adb_client = AsyncDatabaseClient()
+        self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient()

     def batch(self):
         return self.db_client.insert_batch(
diff --git a/tests/manual/llm_api_logic/test_openai_record_classifier.py b/tests/manual/llm_api_logic/test_openai_record_classifier.py
new file mode 100644
index 00000000..72d474d2
--- /dev/null
+++ b/tests/manual/llm_api_logic/test_openai_record_classifier.py
@@ -0,0 +1,26 @@
+import pytest
+
+from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo
+from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier
+
+
+@pytest.mark.asyncio
+async def test_openai_record_classifier():
+    from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType as hct
+
+    d = {
+        hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA",
+        hct.DESCRIPTION: "At the Thursday, November 2 regular city council meeting, Chief Evans administered the oath of office and swearing in of Corporal Cody Lumpkin. Corporal Lumpkin was surrounded by his family and members of the Acworth Police Department for the occasion. Corporal Lumpkin began employment with the Acworth Police Department on June 8,",
+        hct.H3: ["Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police"],
+        hct.H4: ["Share this on Social Media"],
+        hct.DIV: "PHONE DIRECTORY RESOURCES Search for: Search Button NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Administration Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Administration Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police Published On: November 3, 2023 At the Thursday, November 2 regular city council meeting, Chief Evans administered the oath of office and swearing in of Corporal Cody Lumpkin.  Corporal Lumpkin was surrounded by his family and members of the Acworth Police Department for the occasion.  Corporal Lumpkin began employment with the Acworth Police Department on June 8 , 2015, and has served as a patrol officer in addition to time spent time in Special Operations prior to his recent promotion. Share this on Social Media 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2025 City of Acworth Acworth is located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! ESS | Webmail | Handbook | Peak | Laserfiche | Login ",
+    }
+    content_infos = []
+    for content_type, value in d.items():
+        content_info = URLHTMLContentInfo(content_type=content_type, content=value)
+        content_infos.append(content_info)
+
+    classifier = OpenAIRecordClassifier()
+    result = await classifier.classify_url(content_infos)
+    print(type(result))
+    print(result)
\ No newline at end of file
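
As written this manual test only prints the classification; a possible assertion to tighten it, sketched here under the assumption that core.enums.RecordType enumerates the valid values (as the rest of this series suggests):

# Sketch, not in the patch: replace the prints above with an assertion.
from core.enums import RecordType

# `result` is the string returned by classifier.classify_url(content_infos).
valid_values = {record_type.value for record_type in RecordType}
assert result in valid_values, f"unexpected record type: {result}"
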
diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py
index 67869ab6..1a5516c0 100644
--- a/tests/test_alembic/test_revisions.py
+++ b/tests/test_alembic/test_revisions.py
@@ -311,6 +311,11 @@ def test_add_task_tables_and_linking_logic(alembic_runner):
         table_name="url_error_info",
         columns_to_check=["task_id"],
     )
+    assert not columns_in_table(
+        alembic_runner,
+        table_name="url_metadata",
+        columns_to_check=["notes"],
+    )
     table_creation_check(
         alembic_runner,
         tables=[
@@ -324,4 +329,9 @@ def test_add_task_tables_and_linking_logic(alembic_runner):
         alembic_runner,
         table_name="url_error_info",
         columns_to_check=["task_id"],
+    )
+    assert columns_in_table(
+        alembic_runner,
+        table_name="url_metadata",
+        columns_to_check=["notes"],
     )
\ No newline at end of file
diff --git a/tests/test_automated/integration/tasks/test_url_record_type_task.py b/tests/test_automated/integration/tasks/test_url_record_type_task.py
index 349a4e23..a3b336cd 100644
--- a/tests/test_automated/integration/tasks/test_url_record_type_task.py
+++ b/tests/test_automated/integration/tasks/test_url_record_type_task.py
@@ -3,23 +3,24 @@
 import pytest

 from collector_db.enums import TaskType
+from collector_db.models import URLMetadata
 from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator
 from core.enums import RecordType, BatchStatus
+from helpers.DBDataCreator import DBDataCreator
 from helpers.assert_functions import assert_database_has_no_tasks
 from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier

-
 @pytest.mark.asyncio
-async def test_url_record_type_task(db_data_creator):
+async def test_url_record_type_task(db_data_creator: DBDataCreator):
     mock_classifier = MagicMock(spec=DeepSeekRecordClassifier)
     mock_classifier.classify_url.side_effect = [RecordType.ACCIDENT_REPORTS, "Error"]
+    mock_classifier.model_name = "test_notes"

     operator = URLRecordTypeTaskOperator(
         adb_client=db_data_creator.adb_client,
         classifier=mock_classifier
     )
-
     await operator.run_task()

     # No task should have been created due to not meeting prerequisites
@@ -32,7 +33,6 @@
     await operator.run_task()
-
     # Task should have been created
     task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id)
     assert task_info.error_info is None
@@ -45,3 +45,10 @@
     assert task.type == TaskType.RECORD_TYPE
     assert task.url_count == 2
     assert task.url_error_count == 1
+
+    # Get metadata
+    metadata_results = await db_data_creator.adb_client.get_all(URLMetadata)
+    for metadata_row in metadata_results:
+        assert metadata_row.notes == "test_notes"
+        assert metadata_row.value == RecordType.ACCIDENT_REPORTS.value
+
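
Passing a list as side_effect makes the mocked classify_url return successive elements on successive calls, which is how the test produces one success and one error without any LLM traffic; a tiny standalone illustration:

# Standalone illustration of the side_effect pattern used in the test above.
from unittest.mock import MagicMock

mock = MagicMock()
mock.fetch.side_effect = ["first", "second"]
assert mock.fetch() == "first"
assert mock.fetch() == "second"
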
From e139656865c2c9fe9a10f66aaff4863d2c504e49 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:21:04 -0500
Subject: [PATCH 023/182] Add tests to Dockerfile

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index e59b96f2..0f241e7c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,6 +29,7 @@ COPY apply_migrations.py ./apply_migrations.py
 COPY security_manager ./security_manager
 COPY execute.sh ./execute.sh
 COPY .project-root ./.project-root
+COPY tests ./tests
 COPY llm_api_logic ./llm_api_logic

 # Expose the application port

From c3b8752e0a74d6d0d6d44abbf34d781ae08f61d2 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:25:07 -0500
Subject: [PATCH 024/182] Fix error in import routing

---
 .../integration/tasks/test_url_record_type_task.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_automated/integration/tasks/test_url_record_type_task.py b/tests/test_automated/integration/tasks/test_url_record_type_task.py
index a3b336cd..ee624dae 100644
--- a/tests/test_automated/integration/tasks/test_url_record_type_task.py
+++ b/tests/test_automated/integration/tasks/test_url_record_type_task.py
@@ -6,8 +6,8 @@
 from collector_db.models import URLMetadata
 from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator
 from core.enums import RecordType, BatchStatus
-from helpers.DBDataCreator import DBDataCreator
-from helpers.assert_functions import assert_database_has_no_tasks
+from tests.helpers.DBDataCreator import DBDataCreator
+from tests.helpers.assert_functions import assert_database_has_no_tasks
 from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier

 @pytest.mark.asyncio

From 42af80a4f70787045815f1925036e8a35ff3706e Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:28:12 -0500
Subject: [PATCH 025/182] Fix error in import routing

---
 .../integration/tasks/test_url_relevancy_huggingface_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py
index 1cab4ee5..abf86cda 100644
--- a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py
+++ b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py
@@ -7,7 +7,7 @@
 from collector_db.enums import ValidationStatus, ValidationSource
 from collector_db.models import URLMetadata, Task
 from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator
-from helpers.assert_functions import assert_database_has_no_tasks
+from tests.helpers.assert_functions import assert_database_has_no_tasks
 from hugging_face.HuggingFaceInterface import HuggingFaceInterface

From 49bb5a6d04526e4e9416e284c846ee0910b6228c Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:33:20 -0500
Subject: [PATCH 026/182] Fix error in import routing

---
 tests/test_automated/integration/tasks/test_url_html_task.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_automated/integration/tasks/test_url_html_task.py b/tests/test_automated/integration/tasks/test_url_html_task.py
index ff608b66..7674113f 100644
--- a/tests/test_automated/integration/tasks/test_url_html_task.py
+++ b/tests/test_automated/integration/tasks/test_url_html_task.py
@@ -5,11 +5,10 @@
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.enums import TaskType
-from collector_db.models import Task
 from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator
 from core.enums import BatchStatus
-from helpers.DBDataCreator import DBDataCreator
-from helpers.assert_functions import assert_database_has_no_tasks
+from tests.helpers.DBDataCreator import DBDataCreator
+from tests.helpers.assert_functions import assert_database_has_no_tasks
 from html_tag_collector.DataClassTags import ResponseHTMLInfo
 from html_tag_collector.ResponseParser import HTMLResponseParser
 from html_tag_collector.RootURLCache import RootURLCache
From 3e6c0b451f07145d275eb8a18a8cd9287e06b85e Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:39:02 -0500
Subject: [PATCH 027/182] Fix error in import routing

---
 tests/test_automated/integration/api/test_task.py           | 2 +-
 tests/test_automated/integration/tasks/test_example_task.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_automated/integration/api/test_task.py b/tests/test_automated/integration/api/test_task.py
index 64fbe75d..d6e13b1f 100644
--- a/tests/test_automated/integration/api/test_task.py
+++ b/tests/test_automated/integration/api/test_task.py
@@ -1,7 +1,7 @@
 import pytest

 from collector_db.enums import TaskType
-from test_automated.integration.api.conftest import APITestHelper
+from tests.test_automated.integration.api.conftest import APITestHelper


 async def task_setup(ath: APITestHelper) -> int:
diff --git a/tests/test_automated/integration/tasks/test_example_task.py b/tests/test_automated/integration/tasks/test_example_task.py
index 6e69bc89..f6f56521 100644
--- a/tests/test_automated/integration/tasks/test_example_task.py
+++ b/tests/test_automated/integration/tasks/test_example_task.py
@@ -5,7 +5,7 @@
 from collector_db.enums import TaskType
 from core.classes.TaskOperatorBase import TaskOperatorBase
 from core.enums import BatchStatus
-from helpers.DBDataCreator import DBDataCreator
+from tests.helpers.DBDataCreator import DBDataCreator


 class ExampleTaskOperator(TaskOperatorBase):

From 13eae1b396f49a13af9335fcf4c1a23d5799106f Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:46:54 -0500
Subject: [PATCH 028/182] Fix error in import routing

---
 tests/test_alembic/test_revisions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py
index 1a5516c0..22a83496 100644
--- a/tests/test_alembic/test_revisions.py
+++ b/tests/test_alembic/test_revisions.py
@@ -15,7 +15,7 @@
 from sqlalchemy import text


-from test_alembic.helpers import columns_in_table
+from tests.test_alembic.helpers import columns_in_table
 from tests.test_alembic.helpers import get_enum_values, table_creation_check

From b64838279fc4365ae36d5ea4ed0075e2fe89c2fe Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 12:55:57 -0500
Subject: [PATCH 029/182] Comment out `.env` copy command

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 0f241e7c..0410d458 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,4 +38,4 @@ EXPOSE 80
 RUN chmod +x execute.sh
 # Use the below for ease of local development, but remove when pushing to GitHub
 # Because there is no .env file in the repository (for security reasons)
-COPY .env ./.env
+#COPY .env ./.env

From 776671e7d97833785bb54c0c6cfccdf9873f696b Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Wed, 29 Jan 2025 13:17:45 -0500
Subject: [PATCH 030/182] Convert Dockerfile to slim base image

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 0410d458..e820fa66 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 # Dockerfile for Source Collector FastAPI app

-FROM python:3.12.8
+FROM python:3.12.8-slim

 # Set working directory
 WORKDIR /app
From c2601a32f06b5771977635b8e2534e93fc38b7ef Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Thu, 30 Jan 2025 09:24:55 -0500
Subject: [PATCH 031/182] Remove unused files

---
 annotation_pipeline/README.md                 |  43 ---
 annotation_pipeline/config.ini                |  19 --
 annotation_pipeline/data/batch_info.csv       |   4 -
 annotation_pipeline/data/cache.json           |  15 -
 .../urls_2024-08-16_15-18-09.csv              |   3 -
 .../urls_2024-08-20_14-07-03.csv              |  23 --
 annotation_pipeline/populate_labelstudio.py   | 256 ------------------
 annotation_pipeline/record_types.txt          |  36 ---
 annotation_pipeline/requirements.txt          |  13 -
 .../muckrock/muckrock_ml_labeler.py           |  80 ------
 .../manual/label_studio_interface/__init__.py |   0
 ...test_label_studio_interface_integration.py |  73 -----
 12 files changed, 565 deletions(-)
 delete mode 100644 annotation_pipeline/README.md
 delete mode 100644 annotation_pipeline/config.ini
 delete mode 100644 annotation_pipeline/data/batch_info.csv
 delete mode 100644 annotation_pipeline/data/cache.json
 delete mode 100644 annotation_pipeline/data/tag_collector/urls_2024-08-16_15-18-09.csv
 delete mode 100644 annotation_pipeline/data/tag_collector/urls_2024-08-20_14-07-03.csv
 delete mode 100644 annotation_pipeline/populate_labelstudio.py
 delete mode 100644 annotation_pipeline/record_types.txt
 delete mode 100644 annotation_pipeline/requirements.txt
 delete mode 100644 source_collectors/muckrock/muckrock_ml_labeler.py
 delete mode 100644 tests/manual/label_studio_interface/__init__.py
 delete mode 100644 tests/manual/label_studio_interface/test_label_studio_interface_integration.py

diff --git a/annotation_pipeline/README.md b/annotation_pipeline/README.md
deleted file mode 100644
index a6d7a1e4..00000000
--- a/annotation_pipeline/README.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Annotation Pipeline
-
-This Python script automates the process of crawling for relevant URLs, scraping HTML content from those pages, formatting the data as Label Studio tasks, and uploading them to Label Studio for annotation.
-
-## Features
-
-- **Common Crawl Integration**: Initiates the Common Crawl script to crawl for relevant URLs based on specified parameters such as Common Crawl ID, URL type, keyword, and number of pages to process.
-
-- **HTML Tag Collector**: Collects HTML tags from the crawled URLs using the tag collector script.
-
-- **Label Studio Tasks**: Formats the collected data into tasks suitable for Label Studio annotation, including pre-annotation support for assumed record types.
-
-- **Upload to Label Studio**: Uploads the tasks to Label Studio for review and annotation.
-
-## Setup
-
-1. Create venv and install Python dependencies (if not done previously)
-   (assuming these are run within the annotation_pipeline/ folder):
-    - `python -m venv annotation-pipeline-env`
-    - `source annotation-pipeline-env`
-    - `pip install -r requirements.txt`
-
-2. Setup Environment variables in data_source_identification/.env
-    - HUGGINGFACE_ACCESS_TOKEN=...
-    - LABEL_STUDIO_ACCESS_TOKEN=...
-    - LABEL_STUDIO_PROJECT_ID=...
-    - LABEL_STUDIO_ORGANIZATION_ID=...
-
-## Usage
-
-Run from the parent directory (data-source-identification/)
-
-The output logs from common crawl will be stored in `annotation_pipeline/data` by default. This can be modified by editing the `annotation_pipeline/config.ini` file.
-
-`python annotation_pipeline/populate_labelstudio.py common_crawl_id url keyword --pages num_pages [--record-type record_type]`
-
-- `common_crawl_id`: ID of the Common Crawl Corpus to search
-- `url`: Type of URL to search for (e.g. *.gov for all .gov domains).
-- `keyword`: Keyword that must be matched in the full URL -- `--pages num_pages`: Number of pages to search -- `--record-type record_type` (optional): Assumed record type for pre-annotation. - -e.g. `python annotation_pipeline/populate_labelstudio.py CC-MAIN-2024-10 '*.gov' arrest --pages 2 --record-type 'Arrest Records'` diff --git a/annotation_pipeline/config.ini b/annotation_pipeline/config.ini deleted file mode 100644 index 6f2deb96..00000000 --- a/annotation_pipeline/config.ini +++ /dev/null @@ -1,19 +0,0 @@ -# This configuration file contains default settings for the Common Crawler application. -# Settings can be modified to suit different environments or testing needs. - -[DEFAULT] -# Filename for the cache. Stores which pages have been crawled -# at which combinations of index, url search term, and keyword -# to avoid re-crawling them. -cache_filename = cache - -# Directory where data files (both cache and output) are stored. -# Change as needed for different environments. -# Path is relative from working directory that executes common_crawler/main.py -data_dir = annotation_pipeline/data - -# Filename for the output CSV containing crawled URLs. -output_filename = urls - -# Name of the huggingface repo -huggingface_repo_id = PDAP/unlabeled-urls diff --git a/annotation_pipeline/data/batch_info.csv b/annotation_pipeline/data/batch_info.csv deleted file mode 100644 index 35b2c19e..00000000 --- a/annotation_pipeline/data/batch_info.csv +++ /dev/null @@ -1,4 +0,0 @@ -Datetime,Source,Count,Keywords,Notes,Filename -2024-08-12 16:31:20.362180,Common Crawl,0,*.com - police,"CC-MAIN-2024-14, 10 pages, starting at 1",urls_2024-08-12_16-31-20 -2024-08-16 15:18:09.405734,Common Crawl,2,*.com - police,"CC-MAIN-2024-30, 2 pages, starting at 1",urls_2024-08-16_15-18-09 -2024-08-20 14:07:03.339044,Common Crawl,22,*.gov - police,"CC-MAIN-2024-30, 2 pages, starting at 1",urls_2024-08-20_14-07-03 diff --git a/annotation_pipeline/data/cache.json b/annotation_pipeline/data/cache.json deleted file mode 100644 index 066b4285..00000000 --- a/annotation_pipeline/data/cache.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "CC-MAIN-2024-14": { - "*.com": { - "police": 1 - } - }, - "CC-MAIN-2024-30": { - "*.com": { - "police": 2 - }, - "*.gov": { - "police": 2 - } - } -} \ No newline at end of file diff --git a/annotation_pipeline/data/tag_collector/urls_2024-08-16_15-18-09.csv b/annotation_pipeline/data/tag_collector/urls_2024-08-16_15-18-09.csv deleted file mode 100644 index b7b488cd..00000000 --- a/annotation_pipeline/data/tag_collector/urls_2024-08-16_15-18-09.csv +++ /dev/null @@ -1,3 +0,0 @@ -url,url_path,html_title,meta_description,root_page_title,http_response,h1,h2,h3,h4,h5,h6,div_text,batch_id -https://001-adult-toys-n-sex-dolls.com/video/3879/stoya-gets-investigated-by-the-police/,video/3879/stoya-gets-investigated-by-the-police,Stoya gets investigated by the police | One truly amazing adult page with everything included,"Stoya tweeted her accusations, and neither porn star James.",Sex dolls porn clips | One truly amazing adult page with everything included,200,"[""Stoya gets investigated by the police""]","[""Related Videos""]",[],[],[],[],Sex dolls Home Models Categories Sex dolls Home Models Categories Home Models Categories ,2024-08-16 15:18:09 -https://001-adult-toys-n-sex-dolls.com/video/39592/policeman-helps-out-jasmine-jae-glaze-up-her-filth/,video/39592/policeman-helps-out-jasmine-jae-glaze-up-her-filth,Policeman helps out Jasmine Jae glaze up her filth | One truly amazing adult page with 
everything included,Please rest assured that we are working hard to reach out to.,Sex dolls porn clips | One truly amazing adult page with everything included,200,"[""Policeman helps out Jasmine Jae glaze up her filth""]","[""Related Videos""]",[],[],[],[],Sex dolls Home Models Categories Sex dolls Home Models Categories Home Models Categories ,2024-08-16 15:18:09 diff --git a/annotation_pipeline/data/tag_collector/urls_2024-08-20_14-07-03.csv b/annotation_pipeline/data/tag_collector/urls_2024-08-20_14-07-03.csv deleted file mode 100644 index 66108893..00000000 --- a/annotation_pipeline/data/tag_collector/urls_2024-08-20_14-07-03.csv +++ /dev/null @@ -1,23 +0,0 @@ -url,url_path,html_title,meta_description,root_page_title,http_response,h1,h2,h3,h4,h5,h6,div_text,batch_id -https://origin-www.acquisition.gov/dfars/252.225-7029-acquisition-uniform-components-afghan-military-or-afghan-national-police.,dfars/252.225-7029-acquisition-uniform-components-afghan-military-or-afghan-national-police.,252.225-7029 Acquisition of Uniform Components for Afghan Military or Afghan National Police. | Acquisition.GOV,,Home | Acquisition.GOV,200,"[""DFARS"", ""252.225-7029 Acquisition of Uniform Components for Afghan Military or Afghan National Police.""]","[""Main navigation"", ""Breadcrumb"", ""DFARS Parts"", ""Regulations DFARS Menu"", ""DFARS Appendix"", ""Regulations DFARS Appendix Menu"", ""Upper Footer Menu""]","[""FAR""]","[""Favorite"", ""X""]",[],[],,2024-08-20 14:07:03 -https://acworth-ga.gov/events/category/police-event/,events/category/police-event,"Events from November 16 – November 16 › police-event › – City of Acworth, GA",,"Home - City of Acworth, GA",200,"[""police-event""]","[""Events Search and Views Navigation"", ""November 2024""]","[""Event Views Navigation""]",[],[],[],Open toolbar Accessibility Tools Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset ,2024-08-20 14:07:03 -https://acworth-ga.gov/events/tag/police-department/,events/tag/police-department,"Events from November 16 – November 16 – City of Acworth, GA",,"Home - City of Acworth, GA",200,"[""police-department""]","[""Events Search and Views Navigation"", ""November 2024""]","[""Event Views Navigation""]",[],[],[],Open toolbar Accessibility Tools Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset ,2024-08-20 14:07:03 -https://acworth-ga.gov/faq-items/i-got-a-ticket-from-an-acworth-police-officer-what-do-i-do/,faq-items/i-got-a-ticket-from-an-acworth-police-officer-what-do-i-do,"I got a ticket from an Acworth Police officer. What do I do? 
- City of Acworth, GA",There is a court date listed on the citation. This is an arraignment date. You will be asked to enter a plea at this time. Please be on time for your appointed court date as the judge will give you a lot of valuable information at the opening of the court session.,"Home - City of Acworth, GA",200,"[""I got a ticket from an Acworth Police officer. What do I do?""]",[],[],"[""Please Feel Free to Share This Story:""]",[],[],"Open toolbar Accessibility Tools Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset Previous Next I got a ticket from an Acworth Police officer. What do I do? There is a court date listed on the citation. This is an arraignment date. You will be asked to enter a plea at this time. Please be on time for your appointed court date as the judge will give you a lot of valuable information at the opening of the court session. Mike Brooks 2024-07-29T13:52:05-04:00 September 15, 2023 | Please Feel Free to Share This Story: Facebook X LinkedIn Pinterest Email 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2023 City of Acworth Acworth is located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! ESS | Webmail | Handbook | Laserfiche | Login Previous Next I got a ticket from an Acworth Police officer. What do I do? There is a court date listed on the citation. This is an arraignment date. You will be asked to enter a plea at this time. Please be on time for your appointed court date as the judge will give you a lot of valuable information at the opening of the court session. Mike Brooks 2024-07-29T13:52:05-04:00 September 15, 2023 | Please Feel Free to Share This Story: Facebook X LinkedIn Pinterest Email 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2023 City of Acworth Acworth is located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! 
ESS | Webmail | Handbook | Laserfiche | Login ",2024-08-20 14:07:03 -https://acworth-ga.gov/presentation-introducing-three-new-civilian-members-of-the-acworth-police-department/,presentation-introducing-three-new-civilian-members-of-the-acworth-police-department,"Presentation Introducing Three New Civilian Members of the Acworth Police Department - City of Acworth, GA","At the Thursday, May 18 regular city council meeting, Chief Evans introduced three new civilian members of the Acworth Police Department. Macey Williams serves as Crime Analyst, Madison Harrison serves as Evidence Tech, and Emily Hall serves as Victim Advocate. These employees play an integral role in assisting officers with solving cases, and Chief Evans","Home - City of Acworth, GA",200,[],[],"[""Presentation Introducing Three New Civilian Members of the Acworth Police Department""]","[""Share this on Social Media""]",[],[],"Open toolbar Accessibility Tools Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset PHONE DIRECTORY RESOURCES Search for: Search Button PHONE DIRECTORY RESOURCES Search for: Search Button NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Administration Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Administration Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH Presentation Introducing Three New Civilian Members of the Acworth Police Department Published On: May 18, 2023 At the Thursday, May 18 regular city council meeting, Chief Evans introduced three new civilian members of the Acworth Police Department. Macey Williams serves as Crime Analyst, Madison Harrison serves as Evidence Tech, and Emily Hall serves as Victim Advocate. 
These employees play an integral role in assisting officers with solving cases, and Chief Evans was pleased to share with Mayor Allegood and Acworth’s Aldermen how important their new positions are in supporting both the Acworth Police Department and the community as a whole. Share this on Social Media 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2023 City of Acworth Acworth is located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! ESS | Webmail | Handbook | Laserfiche | Login ",2024-08-20 14:07:03 -https://acworth-ga.gov/team_member/police-department-records/,team_member/police-department-records,"Police Department Records - City of Acworth, GA",,"Home - City of Acworth, GA",200,"[""Police Department Records""]",[],[],"[""Please Feel Free to Share This Story:""]",[],[],"Open toolbar Accessibility Tools Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset Accessibility Tools Increase Text Increase Text Decrease Text Decrease Text Grayscale Grayscale High Contrast High Contrast Negative Contrast Negative Contrast Light Background Light Background Links Underline Links Underline Readable Font Readable Font Reset Reset PHONE DIRECTORY RESOURCES Search for: Search Button PHONE DIRECTORY RESOURCES Search for: Search Button NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Administration Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH NEWS DEPARTMENTS GOVERNANCE & DEVELOPMENT Development Clerks Office Court Services DDA, Tourism, and Historic Preservation OPERATIONS Parks, Recreation, and Community Resources Power, Public Works, and Stormwater SUPPORT SERVICES Administration Customer Service Human Resources Finance Information Technology PUBLIC SAFETY Acworth Police RESIDENTS Public Art Master Plan Application for Boards & Commissions Board of Aldermen Customer Service Parks, Recreation, and Community Resources Historic Acworth Master Fee Schedule E-News Sign Up Online Payments BUSINESS Bids & Projects E-Verify Permits, Applications, & Ordinances City Code of Ordinances Master Fee Schedule Start a Business EVENTS VISIT ACWORTH Previous Next Police Department Records Mike Brooks 2023-12-18T10:02:20-05:00 December 18, 2023 | Please Feel Free to Share This Story: Facebook X LinkedIn Pinterest Email 4415 Center Street, Acworth GA 30101 Phone Directory Contact Us © 2023 City of Acworth Acworth is 
located in the foothills of the North Georgia mountains and is nestled along the banks of Lake Acworth and Lake Allatoona, hence its nickname “The Lake City.” The city boasts a rich history, a charming downtown, abundant outdoor recreational activities, a vibrant restaurant scene, and an active festival and events calendar. Acworth is one of the best, family-friendly destinations in the Atlanta region. Come discover why You’re Welcome in Acworth! ESS | Webmail | Handbook | Laserfiche | Login ",2024-08-20 14:07:03 -https://www.ada.gov/_pages/redirects/illinois_state_police/,_pages/redirects/illinois_state_police,SETTLEMENT AGREEMENT BETWEEN THE UNITED STATES AND ILLINOIS STATE POLICE,"The ADA Home Page provides access to Americans with Disabilities Act (ADA) regulations for businesses and State and local governments, technical assistance materials, ADA Standards for Accessible Design, links to Federal agencies with ADA responsibilities and information, updates on new ADA requirements, streaming video, information about Department of Justice ADA settlement agreements, consent decrees, and enforcement activities and access to Freedom of Information Act (FOIA) ADA material",The Americans with Disabilities Act | ADA.gov,200,"[""SETTLEMENT AGREEMENT BETWEEN THE UNITED STATES \n OF AMERICA AND AND ILLINOIS STATE POLICE""]","[""I. BACKGROUND"", ""II. GENERAL AGREEMENT"", ""III. SPECIFIC REMEDIAL RELIEF"", ""IV. IMPLEMENTATION AND ENFORCEMENT""]",[],[],[],[],,2024-08-20 14:07:03 -https://www.ada.gov/policeinfo.htm,policeinfo.htm,American's with Disabilities Act: Information for Law Enforcement,"The ADA Home Page provides access to Americans with Disabilities Act (ADA) regulations for businesses and State and local governments, technical assistance materials, ADA Standards for Accessible Design, links to Federal agencies with ADA responsibilities and information, updates on new ADA requirements, streaming video, information about Department of Justice ADA settlement agreements, consent decrees, and enforcement activities and access to Freedom of Information Act (FOIA) ADA material",The Americans with Disabilities Act | ADA.gov,200,"[""Americans with Disabilites Act Information for Law Enforcement""]","[""PUBLICATIONS""]",[],[],[],[],"Americans with Disabilites Act Information for Law Enforcement How do you interview a witness who is deaf?  How do you assist a person who is having a seizure?  How do you transport a suspect who uses a wheelchair?  Under the Americans with Disabilities Act (ADA), people who have disabilities are entitled to the same services law enforcement provides to anyone else. They may not be excluded or segregated from services, be denied services, or otherwise be treated differently than other people.  The following compliance assistance materials will help state and local law enforcement officers understand how to interact with victims, witnesses, suspects, and others who have disabilities. PUBLICATIONS Communicating with People Who Are Deaf or Hard of Hearing:  ADA Guide for Law Enforcement Officers - This 8-panel pocket guide provides basic information for officers about communicating effectively with people who are deaf or hard of hearing. Guide for Officers Model Policy for Law Enforcement on Communicating with People Who Are Deaf or Hard of Hearing - This 4-page document serves as a model for law enforcement agencies when adopting a policy on effective communication with people who are deaf or hard of hearing.  
Agencies are encouraged to download and adapt the policy to suit their needs. Model Policy (PDF) | Model Policy (HTML) Commonly Asked Questions about the Americans with Disabilities Act and Law Enforcement - This 12-page fact sheet answers frequent questions about the ADA and its effect on law enforcement services involving people with disabilities. Commonly Asked Questions (PDF) | Commonly Asked Questions (HTML) Questions and Answers: The Americans with Disabilities Act and Hiring Police Officers - This 5-page fact sheet answers frequent questions about the ADA and its impact on law enforcement officers with disabilities. Questions and Answers (PDF) | Questions and Answers (HTML) Additional ADA information for state and local government agencies including law enforcement ADA Regulations | Other Publications December 1, 2008 ",2024-08-20 14:07:03 -https://www.ada.gov/policevideo/policedialupgallery.htm,policevideo/policedialupgallery.htm,,,The Americans with Disabilities Act | ADA.gov,404,,,,,,,,2024-08-20 14:07:03 -https://www.adamscountypa.gov/departments/victimwitness/police-departments,departments/victimwitness/police-departments,Adams County PA - Police Departments,,Adams County PA - Official Website,200,"[""Police Departments"", ""Police Departments""]",[],[],[],[],[],[Skip to Content] ,2024-08-20 14:07:03 -https://www.adamscountypa.gov/police-14c9658f036316bb91289647492de2ae/narema/contact-us,police-14c9658f036316bb91289647492de2ae/narema/contact-us,Adams County PA - Contact Us,,Adams County PA - Official Website,200,"[""Contact Us"", ""Contact Us""]","[""Emergency Dial 9-1-1  -   Non-Emergency 717-334-8603""]",[],[],[],[],[Skip to Content] ,2024-08-20 14:07:03 -https://www.adamscountypa.gov/police/earpd/calendar/police-department-commission-meeting,police/earpd/calendar/police-department-commission-meeting,Adams County PA - Police Department Commission Meeting,,Adams County PA - Official Website,200,"[""Police Department Commission Meeting""]",[],[],[],[],[],"[Skip to Content] Search ✖ Home Services Locations Powered by Translate Log in Register ✖ Commissioners Board of Commissioners Commissioners Office Elections and Voters Registration Human Resources Solicitor Tax Services Veterans Affairs County Services Board of Commissioners Meetings County Budget Employment Opportunites Open Records Right to Know Parcel Locator - Interactive Mapping Pay Delinquent Taxes Register to Vote Veterans Services Controller's Fraud Hotline About Adams County Adams County Broadband Taskforce Adams County Profile Adams County School Districts Adams County Plans, Studies, and Publications Adams County Tax Collectors Adams Economic Alliance Land Conservancy of Adams County Penn State Extension Office of Adams County Courts 51st Judicial District Court Court of Common Pleas Court Administration Magisterial District Judges Magisterial District Judges Home District Court 51-3-01 District Court 51-3-02 District Court 51-3-03 District Court 51-3-04 Court Departments Criminal Justice Advisory Board (CJAB) Domestic Relations Section Law Library Operational Services Probation Services County Government County Administration Adult Correction Complex Building and Maintenance​​ Children and Youth Services​​ Conservation District​ Department of Emergency Services​ Elections and Voter Registration​ Human Resources Information Technology Office of Budget and Purchasing Office of Planning and Development​ ​ Protective Services Public Defender Security Solicitor Tax Services​​​ Veterans Affairs Victim Witness Elected 
Officials Clerk of Court​ ​ Clerk of Orphans' Court ​ ​ Controller​​ ​ Coroner​​ ​ District Attorney ​ Prothonotary Recorder of Deeds ​ Register of Wills​ ​ Sheriff ​ Treasurer ​ Municipalities Boroughs Abbottstown Borough Arendtsville Borough Bendersville Borough Biglerville Borough Bonneauville Borough Carroll Valley Borough East Berlin Borough Fairfield Borough Gettysburg Borough Littlestown Borough New Oxford Borough McSherrystown Borough York Springs Borough Townships Berwick Township Butler Township Conewago Township Cumberland Township Franklin Township Freedom Township Germany Township Hamiltonban Township Hamilton Township Highland Township Huntington Township Latimore Township Liberty Township Menallen Township Mt. Joy Township Mt. Pleasant Township Oxford Township Reading Township Straban Township Tyrone Township Union Township ​Associations Council of Government Association of Borough Officials Association of Township Officials York/Adams MH IDD Program Adams County Volunteer Emergency Services Northern Adams Regional Emergency Management Agency Police Department (EARPD) Search Search Police Department Commission Meeting Start Date: Sunday, January 1, 2023 End Date: Sunday, December 31, 2023 Location: Eastern Adams Regional Police Department - 110 N Berlin Road Start Time: 4:00 PM Back to previous page Resources County by Location County Coat of Arms Privacy Statement Terms of Use Navigation Commissioners County Government Courts Municipalities Services Courts Self-Help Center Election Resources Employment Office of Open Records Tax Services​​​ Copyright 2024 ",2024-08-20 14:07:03 -https://www.adamscountypa.gov/police/earpd/calendar/police-department-commission-meeting/07-18-2023-04-00-00-pm-police-department-commission-meeting,police/earpd/calendar/police-department-commission-meeting/07-18-2023-04-00-00-pm-police-department-commission-meeting,Adams County PA - 07/18/2023 04:00:00 PM Police Department Commission Meeting,,Adams County PA - Official Website,200,"[""Police Department Commission Meeting""]",[],[],[],[],[],"[Skip to Content] Search ✖ Home Services Locations Powered by Translate Log in Register ✖ Commissioners Board of Commissioners Commissioners Office Elections and Voters Registration Human Resources Solicitor Tax Services Veterans Affairs County Services Board of Commissioners Meetings County Budget Employment Opportunites Open Records Right to Know Parcel Locator - Interactive Mapping Pay Delinquent Taxes Register to Vote Veterans Services Controller's Fraud Hotline About Adams County Adams County Broadband Taskforce Adams County Profile Adams County School Districts Adams County Plans, Studies, and Publications Adams County Tax Collectors Adams Economic Alliance Land Conservancy of Adams County Penn State Extension Office of Adams County Courts 51st Judicial District Court Court of Common Pleas Court Administration Magisterial District Judges Magisterial District Judges Home District Court 51-3-01 District Court 51-3-02 District Court 51-3-03 District Court 51-3-04 Court Departments Criminal Justice Advisory Board (CJAB) Domestic Relations Section Law Library Operational Services Probation Services County Government County Administration Adult Correction Complex Building and Maintenance​​ Children and Youth Services​​ Conservation District​ Department of Emergency Services​ Elections and Voter Registration​ Human Resources Information Technology Office of Budget and Purchasing Office of Planning and Development​ ​ Protective Services Public Defender Security 
Solicitor Tax Services​​​ Veterans Affairs Victim Witness Elected Officials Clerk of Court​ ​ Clerk of Orphans' Court ​ ​ Controller​​ ​ Coroner​​ ​ District Attorney ​ Prothonotary Recorder of Deeds ​ Register of Wills​ ​ Sheriff ​ Treasurer ​ Municipalities Boroughs Abbottstown Borough Arendtsville Borough Bendersville Borough Biglerville Borough Bonneauville Borough Carroll Valley Borough East Berlin Borough Fairfield Borough Gettysburg Borough Littlestown Borough New Oxford Borough McSherrystown Borough York Springs Borough Townships Berwick Township Butler Township Conewago Township Cumberland Township Franklin Township Freedom Township Germany Township Hamiltonban Township Hamilton Township Highland Township Huntington Township Latimore Township Liberty Township Menallen Township Mt. Joy Township Mt. Pleasant Township Oxford Township Reading Township Straban Township Tyrone Township Union Township ​Associations Council of Government Association of Borough Officials Association of Township Officials York/Adams MH IDD Program Adams County Volunteer Emergency Services Northern Adams Regional Emergency Management Agency Police Department (EARPD) Search Search Police Department Commission Meeting Start Date: Tuesday, July 18, 2023 End Date: Tuesday, July 18, 2023 Location: Eastern Adams Regional Police Department - 110 N Berlin Road Start Time: 4:00 PM Back to previous page Resources County by Location County Coat of Arms Privacy Statement Terms of Use Navigation Commissioners County Government Courts Municipalities Services Courts Self-Help Center Election Resources Employment Office of Open Records Tax Services​​​ Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/brian-weikert,departments/contactuspolice/policecontacts/brian-weikert,Adams County Municipality Cumberland Township - Brian Weikert,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Extension: 450 Direct Phone: 717-334-6485 Ext. 450 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Extension: 450 Direct Phone: 717-334-6485 Ext. 450 Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Extension: 450 Direct Phone: 717-334-6485 Ext. 450 Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Extension: 450 Direct Phone: 717-334-6485 Ext. 450 Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Extension: 450 Direct Phone: 717-334-6485 Ext. 450 Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Extension: 450 Direct Phone: 717-334-6485 Ext. 450 Brian Weikert Patrolman Email: bweikert@cumberlandtwppa.gov Email: Extension: 450 Extension: Direct Phone: 717-334-6485 Ext. 
450 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/daniel-barbagelle,departments/contactuspolice/policecontacts/daniel-barbagelle,Adams County Municipality Cumberland Township - Daniel Barbagello,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Extension: 404 Direct Phone: 717-334-6485 Ext. 404 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Extension: 404 Direct Phone: 717-334-6485 Ext. 404 Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Extension: 404 Direct Phone: 717-334-6485 Ext. 404 Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Extension: 404 Direct Phone: 717-334-6485 Ext. 404 Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Extension: 404 Direct Phone: 717-334-6485 Ext. 404 Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Extension: 404 Direct Phone: 717-334-6485 Ext. 404 Daniel Barbagello Patrolman First Class Email: dbarbagello@cumberlandtwppa.gov Email: Extension: 404 Extension: Direct Phone: 717-334-6485 Ext. 
404 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/eric-yost,departments/contactuspolice/policecontacts/eric-yost,Adams County Municipality Cumberland Township - Eric Yost,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Extension: 4400 Direct Phone: 717-334-6485 Ext. 4400 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Extension: 4400 Direct Phone: 717-334-6485 Ext. 4400 Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Extension: 4400 Direct Phone: 717-334-6485 Ext. 4400 Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Extension: 4400 Direct Phone: 717-334-6485 Ext. 4400 Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Extension: 4400 Direct Phone: 717-334-6485 Ext. 4400 Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Extension: 4400 Direct Phone: 717-334-6485 Ext. 4400 Eric Yost Patrolman Email: eyost@cumberlandtwppa.gov Email: Extension: 4400 Extension: Direct Phone: 717-334-6485 Ext. 
4400 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/josh-goodling,departments/contactuspolice/policecontacts/josh-goodling,Adams County Municipality Cumberland Township - Josh Goodling,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Extension: 407 Direct Phone: 717-334-6485 Ext. 407 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Extension: 407 Direct Phone: 717-334-6485 Ext. 407 Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Extension: 407 Direct Phone: 717-334-6485 Ext. 407 Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Extension: 407 Direct Phone: 717-334-6485 Ext. 407 Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Extension: 407 Direct Phone: 717-334-6485 Ext. 407 Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Extension: 407 Direct Phone: 717-334-6485 Ext. 407 Josh Goodling Sergeant Email: jgoodling@cumberlandtwppa.gov Email: Extension: 407 Extension: Direct Phone: 717-334-6485 Ext. 
407 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/joshua-rosenberger,departments/contactuspolice/policecontacts/joshua-rosenberger,Adams County Municipality Cumberland Township - Joshua Rosenberger,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Extension: 402 Direct Phone: 717-334-6485 Ext. 402 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Extension: 402 Direct Phone: 717-334-6485 Ext. 402 Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Extension: 402 Direct Phone: 717-334-6485 Ext. 402 Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Extension: 402 Direct Phone: 717-334-6485 Ext. 402 Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Extension: 402 Direct Phone: 717-334-6485 Ext. 402 Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Extension: 402 Direct Phone: 717-334-6485 Ext. 402 Joshua Rosenberger Sergeant Email: jrosenberger@cumberlandtwppa.gov Email: Extension: 402 Extension: Direct Phone: 717-334-6485 Ext. 
402 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/lane-hartley,departments/contactuspolice/policecontacts/lane-hartley,Adams County Municipality Cumberland Township - Lane Hartley,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Extension: 408 Direct Phone: 717-334-6485 Ext. 408 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Extension: 408 Direct Phone: 717-334-6485 Ext. 408 Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Extension: 408 Direct Phone: 717-334-6485 Ext. 408 Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Extension: 408 Direct Phone: 717-334-6485 Ext. 408 Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Extension: 408 Direct Phone: 717-334-6485 Ext. 408 Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Extension: 408 Direct Phone: 717-334-6485 Ext. 408 Lane Hartley Patrolman Email: lhartley@cumberlandtwppa.gov Email: Extension: 408 Extension: Direct Phone: 717-334-6485 Ext. 
408 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/departments/contactuspolice/policecontacts/ryan-eiker,departments/contactuspolice/policecontacts/ryan-eiker,Adams County Municipality Cumberland Township - Ryan Eiker,,Adams County Municipality Cumberland Township - Official Website,200,[],[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Extension: 403 Direct Phone: 717-334-6485 Ext. 403 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Extension: 403 Direct Phone: 717-334-6485 Ext. 403 Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Extension: 403 Direct Phone: 717-334-6485 Ext. 403 Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Extension: 403 Direct Phone: 717-334-6485 Ext. 403 Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Extension: 403 Direct Phone: 717-334-6485 Ext. 403 Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Extension: 403 Direct Phone: 717-334-6485 Ext. 403 Ryan Eiker Patrolman First Class Email: reiker@cumberlandtwppa.gov Email: Extension: 403 Extension: Direct Phone: 717-334-6485 Ext. 
403 Direct Phone: Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/home/newsinformation/now-hiring-police-department,home/newsinformation/now-hiring-police-department,Adams County Municipality Cumberland Township - Official Website,Official Website,Adams County Municipality Cumberland Township - Official Website,200,"[""Now Hiring - Police Department""]",[],[],[],[],[],"[Skip to Content] Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Now Hiring - Police Department News Date: Tuesday, June 18, 2024 Full Time Police Cadet Full Time Officer Application Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Search ✖ Log in Register ✖ Home Township Meetings Township Services Township Land Use Search Search Search ✖ Log in Register ✖ Log in Register Home Township Meetings Township Services Township Land Use Search Search Search Now Hiring - Police Department News Date: Tuesday, June 18, 2024 Full Time Police Cadet Full Time Officer Application Now Hiring - Police Department Now Hiring - Police Department News Date: Tuesday, June 18, 2024 News Date: Tuesday, June 18, 2024 News Date: Tuesday, June 18, 2024 Full Time Police Cadet Full Time Officer Application Full Time Police Cadet Full Time Officer Application Full Time Police Cadet Full Time Officer Application Full Time Police Cadet Full Time Officer Application Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Cumberland Township 1370 Fairfield Road Gettysburg, PA 17325 Phone - 717-334-6485 Fax - 717-334-3632 Copyright 2024 Copyright 2024 Copyright 2024 Copyright 2024 ",2024-08-20 14:07:03 -https://cumberland.adamscountypa.gov/home/newsinformation/press-release-for-police-department-donations,home/newsinformation/press-release-for-police-department-donations,,,Adams County Municipality Cumberland Township - Official 
Website,404,,,,,,,,2024-08-20 14:07:03 diff --git a/annotation_pipeline/populate_labelstudio.py b/annotation_pipeline/populate_labelstudio.py deleted file mode 100644 index 49c673c2..00000000 --- a/annotation_pipeline/populate_labelstudio.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -This Python script automates the process of crawling Common Crawl Corpus for relevant URLs, -scraping HTML content from those pages, -formatting the data as Label Studio tasks, -and uploading them to Label Studio for annotation. -""" - -import argparse -import configparser -import os -import subprocess -import sys -from http import HTTPStatus - -import pandas as pd -from huggingface_hub import hf_hub_download - -# The below code sets the working directory to be the root of the entire repository for module imports -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -from label_studio_interface.LabelStudioConfig import LabelStudioConfig -from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager - -def run_subprocess(terminal_command: str): - """ - Runs subprocesses (e.g. common crawl and html tag collector) and handles their outputs + errors - """ - - process = subprocess.Popen(terminal_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1) - - with process.stdout, process.stderr: - for line in process.stdout: - print(line, end='') - for line in process.stderr: - print(line, end='') - - return_code = process.wait() - - stdout, stderr = process.communicate() - - return return_code, stdout, stderr - -def run_common_crawl(common_crawl_id: str, url: str, search_term: str, num_pages: str): - """ - Prompts terminal to run common crawl script provided the following: - Args: SEE def process_crawl() - - See Common Crawl Documentation @ https://github.com/Police-Data-Accessibility-Project/data-source-identification/blob/main/common_crawler/README.md - - CSV of crawled URL's uploaded to HuggingFace - """ - - common_crawl = f"python common_crawler/main.py {common_crawl_id} '{url}' {search_term} --config annotation_pipeline/config.ini --pages {num_pages}" - - return_code, stdout, stderr = run_subprocess(common_crawl) - - return return_code, stdout, stderr - -def run_tag_collector(filename: str): - """ - Prompts terminal to run tag collector on crawled URL's - filename: name of csv containing crawled URL's - - CSV of URL's + collected tags saved in ./labeled-source-text.csv - """ - tag_collector = f"python3 html_tag_collector/collector.py annotation_pipeline/data/{filename} --render-javascript" - - return_code, stdout, stderr = run_subprocess(tag_collector) - - return return_code, stdout, stderr - -def csv_to_label_studio_tasks(csv_file_path: str, batch_id: str, output_name: str, record_type: str = None) -> list[dict]: - """ - Formats CSV into list[dict] with "data" key as labelstudio expects - csv_file_path: path to csv with labeled source text - batch_id: timestamp to append to all URL's in batch - output_name: saves tag_collected CSV + batch_info in data/tag_collector/{output_name} - """ - df = pd.read_csv(csv_file_path) - df['batch_id'] = [batch_id] * len(df) - df = df.fillna('') - os.makedirs("annotation_pipeline/data/tag_collector/", exist_ok=True) - df.to_csv("annotation_pipeline/data/tag_collector/" + output_name.replace("urls/", "", 1), index=False) - - #remove labeled-source-text.csv (updated and written to data/tag_collector) - if os.path.exists(csv_file_path): - os.remove(csv_file_path) - - tasks = [] - - if record_type: - for _, 
row in df.iterrows(): - task_data = row.to_dict() - task_predictions = { - "model_version": "record-type prediction", - "result": [ - { - "from_name": "record-type", - "to_name": "url", - "type": "choices", - "value": { - "choices": [record_type] - } - } - ] - } - - tasks.append({"data": task_data, "predictions": [task_predictions]}) - else: - tasks = [{"data": row.to_dict()} for _, row in df.iterrows()] - - return tasks - -def get_valid_record_types(file_path: str) -> set: - """ load file containing valid record types and return them as a set""" - with open(file_path, 'r') as file: - valid_record_types = {line.strip() for line in file} - return valid_record_types - -def get_huggingface_repo_id(config_file: str) -> str: - """ Returns HuggingFace REPO_ID (where unlabeled URLs are stashed) from config.ini file""" - - config = configparser.ConfigParser() - config.read(config_file) - - # Retrieve the huggingface_repo_id from the DEFAULT section - huggingface_repo_id = config['DEFAULT'].get('huggingface_repo_id') - - if huggingface_repo_id is None: - raise ValueError("huggingface_repo_id not found in the config file.") - - return huggingface_repo_id - -def process_crawl(common_crawl_id: str, url: str, search_term: str, num_pages: str) -> pd.Series: - """Initiated common crawl script and handles output for further processing - - Args: - common_crawl_id: string to specify which common crawl corpus to search - url: specify type of url to search for (e.g. *.gov for all .gov domains) - search_term: further refine search with keyword that must be matched in full URL - num_pages: number of pages to search (15,000 records per page) - - Returns: - batch_info (pd.Series): summary info of crawl, including filename of csv containing relevant URLs - """ - #run common crawl - crawl_return_code, crawl_stdout, crawl_stderr = run_common_crawl(common_crawl_id, url, search_term, num_pages) - - print(f"from populate label studio crawl error: crawl return {crawl_return_code}, crawl stdout {crawl_stdout}, crawl stderr {crawl_stderr}") - - #check success - if crawl_return_code != 0: - raise ValueError(f"Common crawl script failed:\n{crawl_stderr}") - - #print batch info to verify before continuing - batch_info = pd.read_csv("annotation_pipeline/data/batch_info.csv").iloc[-1] - print("Batch Info:\n" + f"{batch_info}") - - if(batch_info["Count"] == 0): - raise ValueError("Batch count is 0. 
Rerun to crawl more pages.") - - return batch_info - -def process_tag_collector(batch_info: pd.Series, FILENAME: str) -> str: - """ - Initiates tag collector script and creates a batch id for all samples - - Args: - batch_info (pd.Series): summary info for crawl - FILENAME (str): filename of csv to collect tags on - - Returns: - batch_id (str): a datetime stamp to track batches - """ - - #run tag collector - tag_collector_return_code, tag_collector_stdout, tag_collector_stderr = run_tag_collector(FILENAME) - - #check success - if tag_collector_return_code != 0: - raise ValueError(f"Tag collector script failed:\n{tag_collector_stderr}") - - #create batch_id from datetime (removes milliseconds) - datetime = batch_info["Datetime"] - batch_id = datetime[:datetime.find('.')] - - return batch_id - -def label_studio_upload(batch_id: str, FILENAME: str, record_type: str): - """ - Handles label studio task formatting and upload - """ - - #convert to label studio task format - data = csv_to_label_studio_tasks("labeled-source-text.csv", batch_id, FILENAME, record_type) - - # Load the configuration for the Label Studio API - config = LabelStudioConfig(".env") - if "REPLACE_WITH_YOUR_TOKEN" in config.authorization_token: - raise ValueError("Please replace the access token in .env with your own access token") - - # Create an API manager - api_manager = LabelStudioAPIManager(config) - - #import tasks - label_studio_response = api_manager.export_tasks_into_project(data) - - #check import success - if label_studio_response.status_code == HTTPStatus.CREATED: - labelstudio_url = api_manager.api_url_constructor.get_import_url().rstrip('/import') - print(f"Tasks successfully imported. Please access the project at {labelstudio_url} to perform review and annotation tasks") - else: - raise ValueError(f"Failed to import tasks. Response code: {label_studio_response.status_code}\n{label_studio_response.text}") - -def main(): - """ - This script automates the process of crawling for relevant URL's, - scraping HTML content from those pages, formatting the data as label studio tasks, - and uploading to label studio - """ - - parser = argparse.ArgumentParser(description='Process crawl arguments') - parser.add_argument('common_crawl_id', type=str, help='common crawl ID') - parser.add_argument('url', type=str, help='URL type to search for') - parser.add_argument('keyword', type=str, help='require this keyword in URL results') - parser.add_argument('--pages', type=str, required=True, help='number of pages to process') - parser.add_argument('--record-type', type=str, required=False, help='assumed record type for pre-annotation') - args = parser.parse_args() - - if args.record_type is not None: - valid_record_types = get_valid_record_types("annotation_pipeline/record_types.txt") - if args.record_type not in valid_record_types: - raise ValueError(f"Invalid record type: {args.record_type}. 
Must be one of {valid_record_types}") - return - - try: - # COMMON CRAWL - batch_info = process_crawl(args.common_crawl_id, args.url, args.keyword, args.pages) - #get urls from hugging face - REPO_ID = get_huggingface_repo_id("annotation_pipeline/config.ini") - FILENAME = "urls/" + batch_info["Filename"] + ".csv" - hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type="dataset", local_dir="annotation_pipeline/data/") - - # TAG COLLECTOR - batch_id = process_tag_collector(batch_info, FILENAME) - - # LABEL STUDIO UPLOAD - label_studio_upload(batch_id, FILENAME, args.record_type) - except ValueError as e: - print(f"Error: {e}") - return - -if __name__ == "__main__": - print("Running Annotation Pipeline...") - main() - diff --git a/annotation_pipeline/record_types.txt b/annotation_pipeline/record_types.txt deleted file mode 100644 index b12931d6..00000000 --- a/annotation_pipeline/record_types.txt +++ /dev/null @@ -1,36 +0,0 @@ -Accident Reports -Arrest Records -Calls for Service -Car GPS -Citations -Dispatch Logs -Dispatch Recordings -Field Contacts -Incident Reports -Misc Police Activity -Officer Involved Shootings -Stops -Surveys -Use of Force Reports -Vehicle Pursuits -Complaints & Misconduct -Daily Activity Logs -Training & Hiring Info -Personnel Records -Annual & Monthly Reports -Budgets & Finances -Contact Info & Agency Meta -Geographic -List of Data Sources -Policies & Contracts -Crime Maps & Reports -Crime Statistics -Media Bulletins -Records Request Info -Resources -Sex Offender Registry -Wanted Persons -Booking Reports -Court Cases -Incarceration Records -Poor Data Source \ No newline at end of file diff --git a/annotation_pipeline/requirements.txt b/annotation_pipeline/requirements.txt deleted file mode 100644 index 5ff85815..00000000 --- a/annotation_pipeline/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -pandas==2.1.4 -python-dotenv~=1.0.1 -argparse~=1.1 -huggingface-hub~=0.22.2 -requests~=2.31.0 -requests_html>=0.10.0 -lxml~=5.1.0 -pyppeteer>=2.0.0 -beautifulsoup4>=4.12.3 -bs4~=0.0.2 -tqdm>=4.64.1 -polars~=0.20.10 -urllib3~=1.26.18 diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py deleted file mode 100644 index 49af4794..00000000 --- a/source_collectors/muckrock/muckrock_ml_labeler.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Utilizes a fine-tuned model to label a dataset of URLs. 
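-Combines each URL's url_path, html_title, and h1 into a single text field, labels it with the PDAP/fine-url-classifier model, and writes the predictions to labeled_muckrock_dataset.csv.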
-""" - -import argparse - -import pandas as pd -import torch -from transformers import AutoTokenizer, AutoModelForSequenceClassification - - -def load_dataset_from_command_line() -> pd.DataFrame: - parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") - parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file") - args = parser.parse_args() - return pd.read_csv(args.csv_file) - - -def create_combined_text_column(df: pd.DataFrame) -> None: - # Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row - columns_to_combine = [ - "url_path", - "html_title", - "h1", - ] # Add other columns here as needed - df["combined_text"] = df[columns_to_combine].apply( - lambda row: " ".join(row.values.astype(str)), axis=1 - ) - - -def get_list_of_combined_texts(df: pd.DataFrame) -> list[str]: - # Convert the combined text into a list - return df["combined_text"].tolist() - - -def save_labeled_muckrock_dataset_to_csv(): - df.to_csv("labeled_muckrock_dataset.csv", index=False) - - -def create_predicted_labels_column(df: pd.DataFrame, predicted_labels: list[str]) -> None: - df["predicted_label"] = predicted_labels - - -def map_predictions_to_labels(model, predictions) -> list[str]: - labels = model.config.id2label - return [labels[int(pred)] for pred in predictions] - - -def get_predicted_labels(texts: list[str]) -> list[str]: - # Load the tokenizer and model - model_name = "PDAP/fine-url-classifier" - tokenizer = AutoTokenizer.from_pretrained(model_name) - - model = AutoModelForSequenceClassification.from_pretrained(model_name) - model.eval() - # Tokenize the inputs - inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") - # Perform inference - with torch.no_grad(): - outputs = model(**inputs) - # Get the predicted labels - predictions = torch.argmax(outputs.logits, dim=-1) - # Map predictions to labels - predicted_labels = map_predictions_to_labels(model=model, predictions=predictions) - - return predicted_labels - - -if __name__ == "__main__": - df = load_dataset_from_command_line() - # TODO: Check for existence of required columns prior to further processing - create_combined_text_column(df=df) - - texts = get_list_of_combined_texts(df=df) - - predicted_labels = get_predicted_labels(texts=texts) - # Add the predicted labels to the dataframe and save - create_predicted_labels_column(df=df, predicted_labels=predicted_labels) - - save_labeled_muckrock_dataset_to_csv() \ No newline at end of file diff --git a/tests/manual/label_studio_interface/__init__.py b/tests/manual/label_studio_interface/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/manual/label_studio_interface/test_label_studio_interface_integration.py b/tests/manual/label_studio_interface/test_label_studio_interface_integration.py deleted file mode 100644 index d8e6fdb4..00000000 --- a/tests/manual/label_studio_interface/test_label_studio_interface_integration.py +++ /dev/null @@ -1,73 +0,0 @@ -import pytest - -from label_studio_interface.DTOs.LabelStudioTaskExportInfo import LabelStudioTaskExportInfo -from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager, generate_random_word -from label_studio_interface.LabelStudioConfig import LabelStudioConfig - - -# Setup method -@pytest.fixture -def api_manager() -> LabelStudioAPIManager: - config = LabelStudioConfig() - return LabelStudioAPIManager(config) - -# Helper methods -def get_member_role_and_user_id(user_id: str, 
org_id: str, data: dict) -> tuple[str, int]: - for result in data['results']: - if result['organization'] == int(org_id) and result['user']['username'] == user_id: - return result['role'], result['user']['id'] - -def test_import_tasks_from_project(api_manager): - response = api_manager.import_tasks_from_project() - print(response.json()) - -def test_export_tasks_into_project(api_manager): - data = [] - for _ in range(10): - data.append( - LabelStudioTaskExportInfo(url=f"https://example.com/{generate_random_word(10)}") - ) - import_id = api_manager.export_tasks_into_project(data) - print("Import ID:", import_id) - - -def test_ping_project(api_manager): - project_accessible = api_manager.ping_project() - assert project_accessible - print("Project is accessible") - - -def test_get_members_in_organization(api_manager): - response = api_manager.get_members_in_organization() - assert response.status_code == 200 - print(response.json()) - -def test_update_member_role(api_manager): - # Note that for this test to work, you need to ensure there is seat available for the user in the organization - # A seat can be made available by deactivating a seat from another user - # (Remember to reassign the seat to the user after the test) - from label_studio_interface.LabelStudioAPIManager import Role - username = 'resibe6343' - response = api_manager.get_members_in_organization() - org_id = api_manager.config.organization_id - role, user_id = get_member_role_and_user_id(username, org_id, response.json()) - print(role) - - # Update role to Annotator - response = api_manager.update_member_role(user_id, Role.ANNOTATOR) - assert response.status_code == 200 - response = api_manager.get_members_in_organization() - role, _ = get_member_role_and_user_id(username, org_id, response.json()) - assert role == Role.ANNOTATOR.value - - # Update role to Deactivated - response = api_manager.update_member_role(user_id, Role.DEACTIVATED) - assert response.status_code == 200 - response = api_manager.get_members_in_organization() - role, _ = get_member_role_and_user_id(username, org_id, response.json()) - assert role == Role.DEACTIVATED.value - - - # response = api_manager.update_member_role("user_id", "role") - # assert response.status_code == 200 - # print(response.json()) \ No newline at end of file From a7f757e829bdbc804c4b47d556b49b979ba6cf62 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 30 Jan 2025 11:30:23 -0500 Subject: [PATCH 032/182] Remove unused files --- html_tag_collector/ResponseFetcher.py | 64 --------------------------- 1 file changed, 64 deletions(-) delete mode 100644 html_tag_collector/ResponseFetcher.py diff --git a/html_tag_collector/ResponseFetcher.py b/html_tag_collector/ResponseFetcher.py deleted file mode 100644 index 04ef3f21..00000000 --- a/html_tag_collector/ResponseFetcher.py +++ /dev/null @@ -1,64 +0,0 @@ -import ssl -import traceback -from dataclasses import dataclass -from typing import Optional - -import requests -import urllib3 -from requests_html import AsyncHTMLSession - -from html_tag_collector.constants import REQUEST_HEADERS -from html_tag_collector.url_adjustment_functions import http_to_https - - -class ResponseFetcher: - - def __init__(self, session: AsyncHTMLSession, url: str, debug=False): - self.headers = REQUEST_HEADERS - self.session = session - self.url = url - self.debug = debug - - def debug_print(self, s: str): - if self.debug: - print(s) - - async def fetch(self, verify_ssl=True): - return await self.session.get( - self.url, - headers=self.headers, - timeout=120, - 
verify=verify_ssl - ) - - async def get_response(self): - response = None - try: - response = await self.fetch() - except (requests.exceptions.SSLError, ssl.SSLError): - # This error is raised when the website uses a legacy SSL version, which is not supported by requests - self.debug_print(f"SSLError: {self.url}") - - # Retry without SSL verification - response = await self.fetch(verify_ssl=False) - except requests.exceptions.ConnectionError: - # Sometimes this error is raised because the provided url uses http - # when it should be https and the website does not handle it properly - self.debug_print(f"MaxRetryError: {self.url}") - - response = await self.retry_with_https() - except (urllib3.exceptions.LocationParseError, requests.exceptions.ReadTimeout) as e: - self.debug_print(f"{type(e).__name__}: {self.url}") - except Exception as e: - self.debug_print(f""" - "Exception:", {self.url} - {traceback.format_exc()} - {e} - """) - finally: - self.debug_print(f"{self.url} - {str(response)}") - return response - - async def retry_with_https(self): - self.url = http_to_https(self.url) - return await self.fetch() From ecde93ff18cc752ba126d694157a51be7aed2d5b Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 30 Jan 2025 11:30:38 -0500 Subject: [PATCH 033/182] Remove unused libraries --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2cc28614..7954fedf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ requests~=2.31.0 -polars~=0.20.10 python-dotenv~=1.0.1 bs4~=0.0.2 tqdm>=4.64.1 @@ -13,7 +12,6 @@ datasets~=2.19.1 huggingface-hub~=0.22.2 # html_tag_collector_only -requests_html>=0.10.0 lxml~=5.1.0 beautifulsoup4>=4.12.3 From 1ab1eab921a74598aedc557833c92918331a9b89 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 30 Jan 2025 11:32:22 -0500 Subject: [PATCH 034/182] Update Dockerfile - refine test copies - set pip install to prefer binary --- Dockerfile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e820fa66..6582e44d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ WORKDIR /app COPY requirements.txt ./requirements.txt # Install dependencies -RUN pip install --no-cache-dir -r requirements.txt +RUN pip install --no-cache-dir --prefer-binary -r requirements.txt RUN playwright install chromium RUN playwright install-deps @@ -29,7 +29,13 @@ COPY apply_migrations.py ./apply_migrations.py COPY security_manager ./security_manager COPY execute.sh ./execute.sh COPY .project-root ./.project-root -COPY tests ./tests + +COPY tests/conftest.py ./tests/conftest.py +COPY tests/__init__.py ./tests/__init__.py +COPY tests/test_automated ./tests/test_automated +COPY tests/test_alembic ./tests/test_alembic +COPY tests/helpers ./tests/helpers + COPY llm_api_logic ./llm_api_logic # Expose the application port From da929dde95671e631de4b00943a15f3422b7b805 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 30 Jan 2025 12:07:23 -0500 Subject: [PATCH 035/182] Update Dockerfile/requirements - Change to Python 3.11.9 - Only install playwright deps for Chromium - Move to tensorflow-cpu --- Dockerfile | 4 ++-- requirements.txt | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6582e44d..fae4de32 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # Dockerfile for Source Collector FastAPI app -FROM python:3.12.8-slim +FROM python:3.11.9-slim # Set working directory WORKDIR /app @@ -10,7 +10,7 @@ COPY requirements.txt 
./requirements.txt # Install dependencies RUN pip install --no-cache-dir --prefer-binary -r requirements.txt RUN playwright install chromium -RUN playwright install-deps +RUN playwright install-deps chromium # Copy project files COPY agency_identifier ./agency_identifier diff --git a/requirements.txt b/requirements.txt index 7954fedf..b72804e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,9 @@ alembic~=1.14.0 asyncpg~=0.30.0 pytest-asyncio~=0.25.2 transformers~=4.40.2 -tf-keras~=2.18.0 +tensorflow-cpu~=2.15.1 +keras~=2.15.0 + # HTML Collector playwright~=1.49.1 From 34a7ef3d59ffd963add83597547c27d6b6c94bb7 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 31 Jan 2025 09:41:23 -0500 Subject: [PATCH 036/182] Begin draft on record type annotation. --- api/routes/annotate.py | 50 ++++++++++++--- collector_db/AsyncDatabaseClient.py | 19 +++--- collector_db/DTOs/URLAnnotationInfo.py | 3 +- collector_db/StatementComposer.py | 20 +++++- core/AsyncCore.py | 54 +++++++++++----- ...equestInfo.py => AnnotationRequestInfo.py} | 5 +- core/DTOs/GetNextURLForAnnotationResponse.py | 9 +++ ...etNextURLForRelevanceAnnotationResponse.py | 9 --- core/DTOs/RecordTypeAnnotationPostInfo.py | 7 +++ ...Info.py => RelevanceAnnotationPostInfo.py} | 0 .../api/helpers/RequestValidator.py | 30 +++++++-- .../integration/api/test_annotate.py | 63 ++++++++++++++----- 12 files changed, 203 insertions(+), 66 deletions(-) rename core/DTOs/{RelevanceAnnotationRequestInfo.py => AnnotationRequestInfo.py} (57%) create mode 100644 core/DTOs/GetNextURLForAnnotationResponse.py delete mode 100644 core/DTOs/GetNextURLForRelevanceAnnotationResponse.py create mode 100644 core/DTOs/RecordTypeAnnotationPostInfo.py rename core/DTOs/{RelevanceAnnotationInfo.py => RelevanceAnnotationPostInfo.py} (100%) diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 25eab1d3..27b21708 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -1,9 +1,11 @@ from fastapi import APIRouter, Depends, Path from api.dependencies import get_async_core +from collector_db.enums import URLMetadataAttributeType from core.AsyncCore import AsyncCore -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse +from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from security_manager.SecurityManager import get_access_info, AccessInfo annotate_router = APIRouter( @@ -17,8 +19,11 @@ async def get_next_url_for_relevance_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForRelevanceAnnotationResponse: - result = await async_core.get_next_url_for_relevance_annotation(user_id=access_info.user_id) +) -> GetNextURLForAnnotationResponse: + result = await async_core.get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_type=URLMetadataAttributeType.RELEVANT + ) return result @@ -28,14 +33,43 @@ async def annotate_url_for_relevance_and_get_next_url( metadata_id: int = Path(description="The metadata id for the associated URL metadata"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info) -) -> GetNextURLForRelevanceAnnotationResponse: +) -> GetNextURLForAnnotationResponse: + """ + Post 
URL annotation and get next URL to annotate + """ + result = await async_core.submit_and_get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_id=metadata_id, + annotation=str(relevance_annotation_post_info.is_relevant), + metadata_type = URLMetadataAttributeType.RELEVANT + ) + return result + +@annotate_router.get("/record-type") +async def get_next_url_for_record_type_annotation( + access_info: AccessInfo = Depends(get_access_info), + async_core: AsyncCore = Depends(get_async_core), +) -> GetNextURLForAnnotationResponse: + result = await async_core.get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_type=URLMetadataAttributeType.RECORD_TYPE + ) + return result + +@annotate_router.post("/record-type/{metadata_id}") +async def annotate_url_for_record_type_and_get_next_url( + record_type_annotation_post_info: RecordTypeAnnotationPostInfo, + metadata_id: int = Path(description="The metadata id for the associated URL metadata"), + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetNextURLForAnnotationResponse: """ Post URL annotation and get next URL to annotate """ - await async_core.submit_url_relevance_annotation( + result = await async_core.submit_and_get_next_url_for_annotation( user_id=access_info.user_id, metadata_id=metadata_id, - annotation=relevance_annotation_post_info + annotation=record_type_annotation_post_info.record_type.value, + metadata_type=URLMetadataAttributeType.RECORD_TYPE ) - result = await async_core.get_next_url_for_relevance_annotation(user_id=access_info.user_id) return result diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 07f1cc10..04d40a82 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -23,7 +23,7 @@ from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import BatchStatus @@ -232,10 +232,11 @@ async def update_url_metadata_status(self, session: AsyncSession, metadata_ids: url_metadata.validation_status = validation_status @session_manager - async def get_next_url_for_relevance_annotation( + async def get_next_url_for_annotation( self, session: AsyncSession, - user_id: int + user_id: int, + metadata_type: URLMetadataAttributeType ) -> URLAnnotationInfo: # Get a URL, its relevancy metadata ID, and HTML data # For a URL which has not yet been annotated by this user id @@ -246,10 +247,11 @@ async def get_next_url_for_relevance_annotation( URL.id.label("url_id"), URL.url, URLMetadata.id.label("metadata_id"), + URLMetadata.value, ) .join(URLMetadata) # Metadata must be relevant - .where(URLMetadata.attribute == URLMetadataAttributeType.RELEVANT.value) + .where(URLMetadata.attribute == metadata_type.value) # Metadata must not be validated .where(URLMetadata.validation_status == ValidationStatus.PENDING_VALIDATION.value) # URL must have HTML content entries @@ -274,6 +276,7 @@ async def get_next_url_for_relevance_annotation( select( subquery.c.url, subquery.c.metadata_id, + subquery.c.value, URLHTMLContent.content_type, URLHTMLContent.content, ) @@ -291,9 +294,10 @@ async def get_next_url_for_relevance_annotation( annotation_info = 
URLAnnotationInfo( url=result[0][0], metadata_id=result[0][1], + suggested_value=result[0][2], html_infos=[] ) - for _, _, content_type, content in result: + for _, _, _, content_type, content in result: html_info = URLHTMLContentInfo( content_type=content_type, content=content @@ -307,11 +311,12 @@ async def add_relevance_annotation( session: AsyncSession, user_id: int, metadata_id: int, - annotation_info: RelevanceAnnotationPostInfo): + annotation: str + ): annotation = MetadataAnnotation( metadata_id=metadata_id, user_id=user_id, - value=str(annotation_info.is_relevant) + value=annotation ) session.add(annotation) diff --git a/collector_db/DTOs/URLAnnotationInfo.py b/collector_db/DTOs/URLAnnotationInfo.py index 54792dfc..844b226d 100644 --- a/collector_db/DTOs/URLAnnotationInfo.py +++ b/collector_db/DTOs/URLAnnotationInfo.py @@ -6,4 +6,5 @@ class URLAnnotationInfo(BaseModel): metadata_id: int url: str - html_infos: list[URLHTMLContentInfo] \ No newline at end of file + html_infos: list[URLHTMLContentInfo] + suggested_value: str \ No newline at end of file diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index dc756fb3..c042e10c 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,8 +1,8 @@ from sqlalchemy import Select, select, exists, Table, func, Subquery -from collector_db.enums import URLMetadataAttributeType -from collector_db.models import URL, URLHTMLContent, URLMetadata +from collector_db.enums import URLMetadataAttributeType, ValidationStatus +from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation from collector_manager.enums import URLStatus @@ -33,6 +33,22 @@ def exclude_urls_with_select_metadata( ) )) + @staticmethod + def exclude_url_annotated_by_user( + statement: Select, + user_id: int + ) -> Select: + return (statement.where( + ~exists( + select(MetadataAnnotation.id). 
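+                    # Correlated subquery: this user's existing annotation for the metadata row; ~exists filters such rows out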
+ where( + MetadataAnnotation.metadata_id == URLMetadata.id, + MetadataAnnotation.user_id == user_id + ) + ) + )) + + @staticmethod def simple_count_subquery(model, attribute: str, label: str) -> Subquery: attr_value = getattr(model, attribute) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index afa5c7ab..6ab9fcf5 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -3,12 +3,11 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo -from collector_db.enums import TaskType -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse +from collector_db.enums import TaskType, URLMetadataAttributeType +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo -from core.DTOs.RelevanceAnnotationRequestInfo import RelevanceAnnotationRequestInfo +from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator @@ -66,39 +65,62 @@ async def run_tasks(self): await self.run_url_relevance_huggingface_task() await self.run_url_record_type_task() - async def convert_to_relevance_annotation_request_info(self, url_info: URLAnnotationInfo) -> RelevanceAnnotationRequestInfo: + async def convert_to_annotation_request_info(self, url_info: URLAnnotationInfo) -> AnnotationRequestInfo: response_html_info = convert_to_response_html_info( html_content_infos=url_info.html_infos ) - return RelevanceAnnotationRequestInfo( + return AnnotationRequestInfo( url=url_info.url, metadata_id=url_info.metadata_id, - html_info=response_html_info + html_info=response_html_info, + suggested_value=url_info.suggested_value ) - async def get_next_url_for_relevance_annotation(self, user_id: int) -> GetNextURLForRelevanceAnnotationResponse: - response = GetNextURLForRelevanceAnnotationResponse() - ua_info: URLAnnotationInfo = await self.adb_client.get_next_url_for_relevance_annotation(user_id=user_id) + async def get_next_url_for_annotation(self, user_id: int, metadata_type: URLMetadataAttributeType) -> GetNextURLForAnnotationResponse: + response = GetNextURLForAnnotationResponse() + ua_info: URLAnnotationInfo = await self.adb_client.get_next_url_for_annotation( + user_id=user_id, + metadata_type=metadata_type + ) if ua_info is None: return response # Format result - result = await self.convert_to_relevance_annotation_request_info(url_info=ua_info) + result = await self.convert_to_annotation_request_info(url_info=ua_info) response.next_annotation = result return response + async def submit_and_get_next_url_for_annotation( + self, + user_id: int, + metadata_id: int, + annotation: str, + metadata_type: URLMetadataAttributeType + ) -> GetNextURLForAnnotationResponse: + await self.submit_url_annotation( + user_id=user_id, + metadata_id=metadata_id, + annotation=annotation, + metadata_type=metadata_type + ) + result = await self.get_next_url_for_annotation( + user_id=user_id, + metadata_type=metadata_type + ) + return result - async def submit_url_relevance_annotation( + async def submit_url_annotation( self, 
user_id: int, metadata_id: int, - annotation: RelevanceAnnotationPostInfo - ) -> GetNextURLForRelevanceAnnotationResponse: + annotation: str, + metadata_type: URLMetadataAttributeType + ) -> GetNextURLForAnnotationResponse: await self.adb_client.add_relevance_annotation( user_id=user_id, metadata_id=metadata_id, - annotation_info=annotation) - return await self.get_next_url_for_relevance_annotation(user_id=user_id) + annotation=annotation) + return await self.get_next_url_for_annotation(user_id=user_id, metadata_type=metadata_type) async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: return await self.adb_client.get_urls(page=page, errors=errors) diff --git a/core/DTOs/RelevanceAnnotationRequestInfo.py b/core/DTOs/AnnotationRequestInfo.py similarity index 57% rename from core/DTOs/RelevanceAnnotationRequestInfo.py rename to core/DTOs/AnnotationRequestInfo.py index de4036db..1e886ae8 100644 --- a/core/DTOs/RelevanceAnnotationRequestInfo.py +++ b/core/DTOs/AnnotationRequestInfo.py @@ -3,7 +3,8 @@ from html_tag_collector.DataClassTags import ResponseHTMLInfo -class RelevanceAnnotationRequestInfo(BaseModel): +class AnnotationRequestInfo(BaseModel): url: str metadata_id: int - html_info: ResponseHTMLInfo \ No newline at end of file + html_info: ResponseHTMLInfo + suggested_value: str \ No newline at end of file diff --git a/core/DTOs/GetNextURLForAnnotationResponse.py b/core/DTOs/GetNextURLForAnnotationResponse.py new file mode 100644 index 00000000..b4bc1087 --- /dev/null +++ b/core/DTOs/GetNextURLForAnnotationResponse.py @@ -0,0 +1,9 @@ +from typing import Optional + +from pydantic import BaseModel + +from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo + + +class GetNextURLForAnnotationResponse(BaseModel): + next_annotation: Optional[AnnotationRequestInfo] = None diff --git a/core/DTOs/GetNextURLForRelevanceAnnotationResponse.py b/core/DTOs/GetNextURLForRelevanceAnnotationResponse.py deleted file mode 100644 index a58a4565..00000000 --- a/core/DTOs/GetNextURLForRelevanceAnnotationResponse.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from core.DTOs.RelevanceAnnotationRequestInfo import RelevanceAnnotationRequestInfo - - -class GetNextURLForRelevanceAnnotationResponse(BaseModel): - next_annotation: Optional[RelevanceAnnotationRequestInfo] = None diff --git a/core/DTOs/RecordTypeAnnotationPostInfo.py b/core/DTOs/RecordTypeAnnotationPostInfo.py new file mode 100644 index 00000000..87e8b674 --- /dev/null +++ b/core/DTOs/RecordTypeAnnotationPostInfo.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + +from core.enums import RecordType + + +class RecordTypeAnnotationPostInfo(BaseModel): + record_type: RecordType \ No newline at end of file diff --git a/core/DTOs/RelevanceAnnotationInfo.py b/core/DTOs/RelevanceAnnotationPostInfo.py similarity index 100% rename from core/DTOs/RelevanceAnnotationInfo.py rename to core/DTOs/RelevanceAnnotationPostInfo.py diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 220b6645..5ff2f239 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -12,13 +12,14 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse -from 
core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.MessageCountResponse import MessageCountResponse from core.DTOs.MessageResponse import MessageResponse -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import BatchStatus from util.helper_functions import update_if_not_none @@ -175,22 +176,39 @@ def process_relevancy(self) -> MessageCountResponse: ) return MessageCountResponse(**data) - def get_next_relevance_annotation(self) -> GetNextURLForRelevanceAnnotationResponse: + def get_next_relevance_annotation(self) -> GetNextURLForAnnotationResponse: data = self.get( url=f"/annotate/relevance" ) - return GetNextURLForRelevanceAnnotationResponse(**data) + return GetNextURLForAnnotationResponse(**data) + + def get_next_record_type_annotation(self) -> GetNextURLForAnnotationResponse: + data = self.get( + url=f"/annotate/record-type" + ) + return GetNextURLForAnnotationResponse(**data) + + def post_record_type_annotation_and_get_next( + self, + metadata_id: int, + record_type_annotation_post_info: RecordTypeAnnotationPostInfo + ) -> GetNextURLForAnnotationResponse: + data = self.post( + url=f"/annotate/record-type/{metadata_id}", + json=record_type_annotation_post_info.model_dump() + ) + return GetNextURLForAnnotationResponse(**data) def post_relevance_annotation_and_get_next( self, metadata_id: int, relevance_annotation_post_info: RelevanceAnnotationPostInfo - ) -> GetNextURLForRelevanceAnnotationResponse: + ) -> GetNextURLForAnnotationResponse: data = self.post( url=f"/annotate/relevance/{metadata_id}", json=relevance_annotation_post_info.model_dump() ) - return GetNextURLForRelevanceAnnotationResponse(**data) + return GetNextURLForAnnotationResponse(**data) def get_urls(self, page: int = 1, errors: bool = False) -> GetURLsResponseInfo: data = self.get( diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 5b8730cf..899c7f28 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -1,14 +1,23 @@ +from typing import Any + import pytest from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse +from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo +from core.enums import RecordType from tests.test_automated.integration.api.conftest import MOCK_USER_ID - -@pytest.mark.asyncio -async def test_annotate(api_test_helper): +async def run_annotation_test( + api_test_helper, + submit_and_get_next_function: callable, + get_next_function: callable, + post_info: Any, + 
metadata_attribute: URLMetadataAttributeType, + expected_metadata_value: str +): ath = api_test_helper # Create batch with status `in-process` and strategy `example` @@ -20,7 +29,7 @@ async def test_annotate(api_test_helper): url_2 = iui.url_mappings[1] kwargs = { - "attribute": URLMetadataAttributeType.RELEVANT, + "attribute": metadata_attribute, "validation_status": ValidationStatus.PENDING_VALIDATION, "validation_source": ValidationSource.MACHINE_LEARNING } @@ -39,20 +48,18 @@ async def test_annotate(api_test_helper): # Add HTML data to both await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) # Call `GET` `/annotate/url` and receive next URL - request_info_1: GetNextURLForRelevanceAnnotationResponse = ath.request_validator.get_next_relevance_annotation() + request_info_1: GetNextURLForAnnotationResponse = get_next_function() inner_info_1 = request_info_1.next_annotation # Validate presence of HTML data in `html` field assert inner_info_1.html_info.description != "" assert inner_info_1.html_info.title != "" + assert inner_info_1.suggested_value == "False" - post_info = RelevanceAnnotationPostInfo( - is_relevant=True - ) # Call `POST` `/annotate/url` with finished annotation, and receive next URL - request_info_2 = ath.request_validator.post_relevance_annotation_and_get_next( - metadata_id=inner_info_1.metadata_id, - relevance_annotation_post_info=post_info + request_info_2 = submit_and_get_next_function( + inner_info_1.metadata_id, + post_info ) inner_info_2 = request_info_2.next_annotation # Confirm 2nd URL is distinct from 1st @@ -68,7 +75,7 @@ async def test_annotate(api_test_helper): ) assert len(results) == 1 assert results[0].user_id == MOCK_USER_ID - assert results[0].value == "True" + assert results[0].value == expected_metadata_value # Submit this one in turn, and no subsequent annotation info should be returned request_info_3 = ath.request_validator.post_relevance_annotation_and_get_next( @@ -76,4 +83,30 @@ async def test_annotate(api_test_helper): relevance_annotation_post_info=post_info ) - assert request_info_3.next_annotation is None \ No newline at end of file + assert request_info_3.next_annotation is None + +@pytest.mark.asyncio +async def test_annotate_relevancy(api_test_helper): + await run_annotation_test( + api_test_helper=api_test_helper, + submit_and_get_next_function=api_test_helper.request_validator.post_relevance_annotation_and_get_next, + get_next_function=api_test_helper.request_validator.get_next_relevance_annotation, + post_info=RelevanceAnnotationPostInfo( + is_relevant=True + ), + metadata_attribute=URLMetadataAttributeType.RELEVANT, + expected_metadata_value="True" + ) + +@pytest.mark.asyncio +async def test_annotate_record_type(api_test_helper): + await run_annotation_test( + api_test_helper=api_test_helper, + submit_and_get_next_function=api_test_helper.request_validator.post_record_type_annotation_and_get_next, + get_next_function=api_test_helper.request_validator.get_next_record_type_annotation, + post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ACCIDENT_REPORTS + ), + metadata_attribute=URLMetadataAttributeType.RECORD_TYPE, + expected_metadata_value=RecordType.ACCIDENT_REPORTS.value + ) From d6efb62a35210d4de9a6e530b994d07def5aa33e Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 31 Jan 2025 11:42:12 -0500 Subject: [PATCH 037/182] Draft work --- .../integration/api/helpers/RequestValidator.py | 4 ++-- tests/test_automated/integration/api/test_annotate.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff 
--git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 5ff2f239..d3e60e1d 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -195,7 +195,7 @@ def post_record_type_annotation_and_get_next( ) -> GetNextURLForAnnotationResponse: data = self.post( url=f"/annotate/record-type/{metadata_id}", - json=record_type_annotation_post_info.model_dump() + json=record_type_annotation_post_info.model_dump(mode='json') ) return GetNextURLForAnnotationResponse(**data) @@ -206,7 +206,7 @@ def post_relevance_annotation_and_get_next( ) -> GetNextURLForAnnotationResponse: data = self.post( url=f"/annotate/relevance/{metadata_id}", - json=relevance_annotation_post_info.model_dump() + json=relevance_annotation_post_info.model_dump(mode='json') ) return GetNextURLForAnnotationResponse(**data) diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 899c7f28..1ee03963 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -78,9 +78,9 @@ async def run_annotation_test( assert results[0].value == expected_metadata_value # Submit this one in turn, and no subsequent annotation info should be returned - request_info_3 = ath.request_validator.post_relevance_annotation_and_get_next( - metadata_id=inner_info_2.metadata_id, - relevance_annotation_post_info=post_info + request_info_3 = submit_and_get_next_function( + inner_info_2.metadata_id, + post_info ) assert request_info_3.next_annotation is None From 88df8d585530f879319b9e077dad3d944e0760f1 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 31 Jan 2025 11:45:34 -0500 Subject: [PATCH 038/182] Update container python version --- .github/workflows/test_app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index c83608ac..e16d1771 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -21,7 +21,7 @@ jobs: container-job: runs-on: ubuntu-latest timeout-minutes: 20 - container: python:3.12.8 + container: python:3.11.9 services: postgres: From 14e3d66ad8191ef6ce2a1e6158a74e044a8db3dc Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 2 Feb 2025 10:46:01 -0500 Subject: [PATCH 039/182] Create new table: `url_agency_suggestions` --- ...19bf57df581a_add_url_agency_suggestions.py | 40 +++++++++++++++++++ collector_db/models.py | 36 +++++++++++++++-- tests/test_alembic/test_revisions.py | 12 +++++- 3 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py diff --git a/collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py b/collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py new file mode 100644 index 00000000..56d47427 --- /dev/null +++ b/collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py @@ -0,0 +1,40 @@ +"""Add url_agency_suggestions + +Revision ID: 19bf57df581a +Revises: 072b32a45b1c +Create Date: 2025-02-02 10:33:02.029875 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from collector_db.enums import PGEnum +# revision identifiers, used by Alembic. 
+revision: str = '19bf57df581a' +down_revision: Union[str, None] = '072b32a45b1c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +suggestion_type_enum = PGEnum('Suggestion', 'Unknown', 'New Agency', 'Confirmed', name='url_agency_suggestion_type') + +def upgrade() -> None: + op.create_table('url_agency_suggestions', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('url_id', sa.Integer(), nullable=False), + sa.Column('suggestion_type', suggestion_type_enum, nullable=False), + sa.Column('agency_id', sa.Integer(), nullable=True), + sa.Column('agency_name', sa.String(), nullable=False), + sa.Column('state', sa.String(), nullable=True), + sa.Column('county', sa.String(), nullable=True), + sa.Column('locality', sa.String(), nullable=True), + sa.Column('updated_at', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=False), + sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ), + sa.PrimaryKeyConstraint('id') + ) + + +def downgrade() -> None: + op.drop_table('url_agency_suggestions') + suggestion_type_enum.drop(op.get_bind(), checkfirst=True) diff --git a/collector_db/models.py b/collector_db/models.py index aa33d41e..6ca04846 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -90,6 +90,7 @@ class URL(Base): secondary="link_task_urls", back_populates="urls", ) + agency_suggestions = relationship("URLAgencySuggestion", back_populates="url", cascade="all, delete-orphan") # URL Metadata table definition @@ -98,11 +99,11 @@ class URLMetadata(Base): __table_args__ = (UniqueConstraint( "url_id", "attribute", - name="model_num2_key"), + name="uq_url_id_attribute"), ) id = Column(Integer, primary_key=True) - url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) + url_id = Column(Integer, ForeignKey('urls.id', name='url_metadata_url_id_fkey'), nullable=False) attribute = Column( PGEnum('Record Type', 'Agency', 'Relevant', name='url_attribute'), nullable=False) @@ -143,7 +144,7 @@ class MetadataAnnotation(Base): url_metadata = relationship("URLMetadata", back_populates="annotations") class RootURL(Base): - __tablename__ = 'root_urls' + __tablename__ = 'root_url_cache' __table_args__ = ( UniqueConstraint( "url", @@ -276,6 +277,8 @@ class LinkTaskURL(Base): task_id = Column(Integer, ForeignKey('tasks.id', ondelete="CASCADE"), primary_key=True) url_id = Column(Integer, ForeignKey('urls.id', ondelete="CASCADE"), primary_key=True) + + class TaskError(Base): __tablename__ = 'task_errors' @@ -291,4 +294,29 @@ class TaskError(Base): "task_id", "error", name="uq_task_id_error"), - ) \ No newline at end of file + ) + +class URLAgencySuggestion(Base): + __tablename__ = 'url_agency_suggestions' + + id = Column(Integer, primary_key=True) + url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) + suggestion_type = Column( + PGEnum( + 'Suggestion', + 'Unknown', + 'New Agency', + 'Confirmed', + name='url_agency_suggestion_type' + ), + nullable=False + ) + agency_id = Column(Integer, nullable=True) + agency_name = Column(String, nullable=False) + state = Column(String, nullable=True) + county = Column(String, nullable=True) + locality = Column(String, nullable=True) + updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + + # Relationships + url = relationship("URL", back_populates="agency_suggestions") \ No newline at end of file diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py index 22a83496..e94f4180 100644 --- 
a/tests/test_alembic/test_revisions.py +++ b/tests/test_alembic/test_revisions.py @@ -334,4 +334,14 @@ def test_add_task_tables_and_linking_logic(alembic_runner): alembic_runner, table_name="url_metadata", columns_to_check=["notes"], - ) \ No newline at end of file + ) + +def test_add_url_agency_suggestions(alembic_runner): + table_creation_check( + alembic_runner, + tables=[ + "url_agency_suggestions" + ], + start_revision="072b32a45b1c", + end_revision="19bf57df581a" + ) From f73c084c7fd4e7eed55941fd9cd89e13cfce7baf Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 4 Feb 2025 19:50:23 -0500 Subject: [PATCH 040/182] Move alembic to standalone directory. --- alembic.ini | 2 +- {collector_db/alembic => alembic}/README.md | 0 {collector_db/alembic => alembic}/env.py | 0 {collector_db/alembic => alembic}/script.py.mako | 0 .../072b32a45b1c_add_task_tables_and_linking_logic.py | 0 .../108dac321086_update_metadata_validation_status.py | 0 .../versions/19bf57df581a_add_url_agency_suggestions.py | 8 +++++++- ...fa_create_url_error_info_table_and_url_error_status.py | 0 .../versions/86692fc1d862_add_url_metadata_table.py | 0 .../9afd8a5633c9_create_htmlcontent_and_rooturl_tables.py | 0 .../versions/a4750e7ff8e7_add_updated_at_to_url_table.py | 0 .../versions/d11f07224d1f_initial_creation.py | 0 .../versions/dae00e5aa8dd_create_rooturlcache.py | 0 ...db6d60feda7d_convert_batch_strategy_status_to_enums.py | 0 .../dcd158092de0_create_metadata_annotation_table.py | 0 .../versions/e27c5f8409a3_convert_url_outcome_to_enum.py | 0 16 files changed, 8 insertions(+), 2 deletions(-) rename {collector_db/alembic => alembic}/README.md (100%) rename {collector_db/alembic => alembic}/env.py (100%) rename {collector_db/alembic => alembic}/script.py.mako (100%) rename {collector_db/alembic => alembic}/versions/072b32a45b1c_add_task_tables_and_linking_logic.py (100%) rename {collector_db/alembic => alembic}/versions/108dac321086_update_metadata_validation_status.py (100%) rename {collector_db/alembic => alembic}/versions/19bf57df581a_add_url_agency_suggestions.py (89%) rename {collector_db/alembic => alembic}/versions/5a5ca06f36fa_create_url_error_info_table_and_url_error_status.py (100%) rename {collector_db/alembic => alembic}/versions/86692fc1d862_add_url_metadata_table.py (100%) rename {collector_db/alembic => alembic}/versions/9afd8a5633c9_create_htmlcontent_and_rooturl_tables.py (100%) rename {collector_db/alembic => alembic}/versions/a4750e7ff8e7_add_updated_at_to_url_table.py (100%) rename {collector_db/alembic => alembic}/versions/d11f07224d1f_initial_creation.py (100%) rename {collector_db/alembic => alembic}/versions/dae00e5aa8dd_create_rooturlcache.py (100%) rename {collector_db/alembic => alembic}/versions/db6d60feda7d_convert_batch_strategy_status_to_enums.py (100%) rename {collector_db/alembic => alembic}/versions/dcd158092de0_create_metadata_annotation_table.py (100%) rename {collector_db/alembic => alembic}/versions/e27c5f8409a3_convert_url_outcome_to_enum.py (100%) diff --git a/alembic.ini b/alembic.ini index 7cc1a0d5..9daecaa2 100644 --- a/alembic.ini +++ b/alembic.ini @@ -3,7 +3,7 @@ [alembic] # path to migration scripts # Use forward slashes (/) also on windows to provide an os agnostic path -script_location = collector_db/alembic +script_location = alembic # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s # Uncomment the line below if you want the files to be prepended with date and time diff --git a/collector_db/alembic/README.md b/alembic/README.md 
similarity index 100% rename from collector_db/alembic/README.md rename to alembic/README.md diff --git a/collector_db/alembic/env.py b/alembic/env.py similarity index 100% rename from collector_db/alembic/env.py rename to alembic/env.py diff --git a/collector_db/alembic/script.py.mako b/alembic/script.py.mako similarity index 100% rename from collector_db/alembic/script.py.mako rename to alembic/script.py.mako diff --git a/collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py b/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py similarity index 100% rename from collector_db/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py rename to alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py diff --git a/collector_db/alembic/versions/108dac321086_update_metadata_validation_status.py b/alembic/versions/108dac321086_update_metadata_validation_status.py similarity index 100% rename from collector_db/alembic/versions/108dac321086_update_metadata_validation_status.py rename to alembic/versions/108dac321086_update_metadata_validation_status.py diff --git a/collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py b/alembic/versions/19bf57df581a_add_url_agency_suggestions.py similarity index 89% rename from collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py rename to alembic/versions/19bf57df581a_add_url_agency_suggestions.py index 56d47427..aca55c13 100644 --- a/collector_db/alembic/versions/19bf57df581a_add_url_agency_suggestions.py +++ b/alembic/versions/19bf57df581a_add_url_agency_suggestions.py @@ -17,7 +17,13 @@ depends_on: Union[str, Sequence[str], None] = None -suggestion_type_enum = PGEnum('Suggestion', 'Unknown', 'New Agency', 'Confirmed', name='url_agency_suggestion_type') +suggestion_type_enum = PGEnum( + 'Auto Suggestion', + 'Manual Suggestion', + 'Unknown', + 'New Agency', + 'Confirmed', name='url_agency_suggestion_type' +) def upgrade() -> None: op.create_table('url_agency_suggestions', diff --git a/collector_db/alembic/versions/5a5ca06f36fa_create_url_error_info_table_and_url_error_status.py b/alembic/versions/5a5ca06f36fa_create_url_error_info_table_and_url_error_status.py similarity index 100% rename from collector_db/alembic/versions/5a5ca06f36fa_create_url_error_info_table_and_url_error_status.py rename to alembic/versions/5a5ca06f36fa_create_url_error_info_table_and_url_error_status.py diff --git a/collector_db/alembic/versions/86692fc1d862_add_url_metadata_table.py b/alembic/versions/86692fc1d862_add_url_metadata_table.py similarity index 100% rename from collector_db/alembic/versions/86692fc1d862_add_url_metadata_table.py rename to alembic/versions/86692fc1d862_add_url_metadata_table.py diff --git a/collector_db/alembic/versions/9afd8a5633c9_create_htmlcontent_and_rooturl_tables.py b/alembic/versions/9afd8a5633c9_create_htmlcontent_and_rooturl_tables.py similarity index 100% rename from collector_db/alembic/versions/9afd8a5633c9_create_htmlcontent_and_rooturl_tables.py rename to alembic/versions/9afd8a5633c9_create_htmlcontent_and_rooturl_tables.py diff --git a/collector_db/alembic/versions/a4750e7ff8e7_add_updated_at_to_url_table.py b/alembic/versions/a4750e7ff8e7_add_updated_at_to_url_table.py similarity index 100% rename from collector_db/alembic/versions/a4750e7ff8e7_add_updated_at_to_url_table.py rename to alembic/versions/a4750e7ff8e7_add_updated_at_to_url_table.py diff --git a/collector_db/alembic/versions/d11f07224d1f_initial_creation.py 
b/alembic/versions/d11f07224d1f_initial_creation.py similarity index 100% rename from collector_db/alembic/versions/d11f07224d1f_initial_creation.py rename to alembic/versions/d11f07224d1f_initial_creation.py diff --git a/collector_db/alembic/versions/dae00e5aa8dd_create_rooturlcache.py b/alembic/versions/dae00e5aa8dd_create_rooturlcache.py similarity index 100% rename from collector_db/alembic/versions/dae00e5aa8dd_create_rooturlcache.py rename to alembic/versions/dae00e5aa8dd_create_rooturlcache.py diff --git a/collector_db/alembic/versions/db6d60feda7d_convert_batch_strategy_status_to_enums.py b/alembic/versions/db6d60feda7d_convert_batch_strategy_status_to_enums.py similarity index 100% rename from collector_db/alembic/versions/db6d60feda7d_convert_batch_strategy_status_to_enums.py rename to alembic/versions/db6d60feda7d_convert_batch_strategy_status_to_enums.py diff --git a/collector_db/alembic/versions/dcd158092de0_create_metadata_annotation_table.py b/alembic/versions/dcd158092de0_create_metadata_annotation_table.py similarity index 100% rename from collector_db/alembic/versions/dcd158092de0_create_metadata_annotation_table.py rename to alembic/versions/dcd158092de0_create_metadata_annotation_table.py diff --git a/collector_db/alembic/versions/e27c5f8409a3_convert_url_outcome_to_enum.py b/alembic/versions/e27c5f8409a3_convert_url_outcome_to_enum.py similarity index 100% rename from collector_db/alembic/versions/e27c5f8409a3_convert_url_outcome_to_enum.py rename to alembic/versions/e27c5f8409a3_convert_url_outcome_to_enum.py From a6dcd00903718140e169ea808397d8f4bf5b554e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 5 Feb 2025 09:48:30 -0500 Subject: [PATCH 041/182] Add logic for Agency Identification. --- ENV.md | 35 +-- agency_identifier/MuckrockAPIInterface.py | 51 ++++ ...19bf57df581a_add_url_agency_suggestions.py | 25 +- collector_db/AsyncDatabaseClient.py | 61 ++++- collector_db/StatementComposer.py | 10 +- collector_db/enums.py | 1 + collector_db/models.py | 19 +- core/DTOs/URLAgencySuggestionInfo.py | 15 ++ .../AgencyIdentificationTDO.py | 11 + .../AgencyIdentificationTaskOperator.py | 92 +++++++ .../AgencyIdentificationSubtaskBase.py | 16 ++ .../AutoGooglerAgencyIdentificationSubtask.py | 25 ++ .../CKANAgencyIdentificationSubtask.py | 29 ++ ...ommonCrawlerAgencyIdentificationSubtask.py | 23 ++ .../MuckrockAgencyIdentificationSubtask.py | 42 +++ core/classes/subtasks/__init__.py | 0 core/enums.py | 8 + core/exceptions.py | 8 + core/helpers.py | 48 ++++ html_tag_collector/URLRequestInterface.py | 7 - pdap_api_client/AccessManager.py | 104 ++++--- pdap_api_client/DTOs.py | 24 +- pdap_api_client/PDAPClient.py | 45 +++- pdap_api_client/enums.py | 7 + requirements.txt | 1 + tests/helpers/DBDataCreator.py | 26 +- tests/manual/agency_identifier/__init__.py | 0 .../test_muckrock_api_interface.py | 16 ++ tests/manual/pdap_client/__init__.py | 0 .../manual/pdap_client/test_access_manager.py | 23 ++ tests/manual/pdap_client/test_pdap_client.py | 23 ++ .../api/helpers/RequestValidator.py | 4 +- .../integration/api/test_annotate.py | 6 +- .../tasks/test_agency_preannotation_task.py | 255 ++++++++++++++++++ util/helper_functions.py | 4 +- 35 files changed, 977 insertions(+), 87 deletions(-) create mode 100644 agency_identifier/MuckrockAPIInterface.py create mode 100644 core/DTOs/URLAgencySuggestionInfo.py create mode 100644 core/DTOs/task_data_objects/AgencyIdentificationTDO.py create mode 100644 core/classes/AgencyIdentificationTaskOperator.py create mode 100644 
core/classes/subtasks/AgencyIdentificationSubtaskBase.py create mode 100644 core/classes/subtasks/AutoGooglerAgencyIdentificationSubtask.py create mode 100644 core/classes/subtasks/CKANAgencyIdentificationSubtask.py create mode 100644 core/classes/subtasks/CommonCrawlerAgencyIdentificationSubtask.py create mode 100644 core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py create mode 100644 core/classes/subtasks/__init__.py create mode 100644 pdap_api_client/enums.py create mode 100644 tests/manual/agency_identifier/__init__.py create mode 100644 tests/manual/agency_identifier/test_muckrock_api_interface.py create mode 100644 tests/manual/pdap_client/__init__.py create mode 100644 tests/manual/pdap_client/test_access_manager.py create mode 100644 tests/manual/pdap_client/test_pdap_client.py create mode 100644 tests/test_automated/integration/tasks/test_agency_preannotation_task.py diff --git a/ENV.md b/ENV.md index 943ad293..8fd30c33 100644 --- a/ENV.md +++ b/ENV.md @@ -2,18 +2,23 @@ This page provides a full list, with description, of all the environment variabl Please ensure these are properly defined in a `.env` file in the root directory. -| Name | Description | Example | -|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| `LABEL_STUDIO_ACCESS_TOKEN` | The access token for the Label Studio API. The access token for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. | `abc123` | -| `LABEL_STUDIO_PROJECT_ID` | The project ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL, as in `https://app.heartex.com/projects/58475/` | `58475` | -| `LABEL_STUDIO_ORGANIZATION_ID` | The organization ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [Organization section](https://app.heartex.com/organization?page=1), where the organization ID can be copied. | `6758` | -| `GOOGLE_API_KEY` | The API key required for accessing the Google Custom Search API | `abc123` | -| `GOOGLE_CSE_ID` | The CSE ID required for accessing the Google Custom Search API | `abc123` | -|`POSTGRES_USER` | The username for the test database | `test_source_collector_user` | -|`POSTGRES_PASSWORD` | The password for the test database | `HanviliciousHamiltonHilltops` | -|`POSTGRES_DB` | The database name for the test database | `source_collector_test_db` | -|`POSTGRES_HOST` | The host for the test database | `127.0.0.1` | -|`POSTGRES_PORT` | The port for the test database | `5432` | -|`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token that is used in the Data Sources App for encoding. |`abc123`| -|`DEV`| Set to any value to run the application in development mode. |`true`| -|'DEEPSEEK_API_KEY'| The API key required for accessing the DeepSeek API. 
|`abc123`|
+| Name                 | Description                                                                                                                                                                                                                                         | Example                        |
+|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------|
+| `LABEL_STUDIO_ACCESS_TOKEN` | The access token for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. | `abc123` |
+| `LABEL_STUDIO_PROJECT_ID` | The project ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL, as in `https://app.heartex.com/projects/58475/` | `58475` |
+| `LABEL_STUDIO_ORGANIZATION_ID` | The organization ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [Organization section](https://app.heartex.com/organization?page=1), where the organization ID can be copied. | `6758` |
+| `GOOGLE_API_KEY` | The API key required for accessing the Google Custom Search API | `abc123` |
+| `GOOGLE_CSE_ID` | The CSE ID required for accessing the Google Custom Search API | `abc123` |
+|`POSTGRES_USER` | The username for the test database | `test_source_collector_user` |
+|`POSTGRES_PASSWORD` | The password for the test database | `HanviliciousHamiltonHilltops` |
+|`POSTGRES_DB` | The database name for the test database | `source_collector_test_db` |
+|`POSTGRES_HOST` | The host for the test database | `127.0.0.1` |
+|`POSTGRES_PORT` | The port for the test database | `5432` |
+|`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token that is used in the Data Sources App for encoding. | `abc123` |
+|`DEV`| Set to any value to run the application in development mode. | `true` |
+|`DEEPSEEK_API_KEY`| The API key required for accessing the DeepSeek API. 
| `abc123` |
+|`OPENAI_API_KEY`| The API key required for accessing the OpenAI API. | `abc123` |
+|`PDAP_EMAIL`| An email address for accessing the PDAP API. | `abc123@test.com` |
+|`PDAP_PASSWORD`| A password for accessing the PDAP API. | `abc123` |
+|`PDAP_API_KEY`| An API key for accessing the PDAP API. | `abc123` |
+
diff --git a/agency_identifier/MuckrockAPIInterface.py b/agency_identifier/MuckrockAPIInterface.py
new file mode 100644
index 00000000..bbc56ee7
--- /dev/null
+++ b/agency_identifier/MuckrockAPIInterface.py
@@ -0,0 +1,51 @@
+from enum import Enum
+from typing import Optional
+
+import aiohttp
+from aiohttp import ClientSession
+from pydantic import BaseModel
+
+
+class AgencyLookupResponseType(Enum):
+    FOUND = "found"
+    NOT_FOUND = "not_found"
+    ERROR = "error"
+
+class AgencyLookupResponse(BaseModel):
+    name: Optional[str]
+    type: AgencyLookupResponseType
+    error: Optional[str] = None
+
+
+
+class MuckrockAPIInterface:
+
+    def __init__(self, session: ClientSession):
+        self.base_url = "https://www.muckrock.com/api_v1/"
+        self.session = session
+
+    def build_url(self, subpath: str):
+        return f"{self.base_url}{subpath}"
+
+
+    async def lookup_agency(self, muckrock_agency_id: int) -> AgencyLookupResponse:
+        url = self.build_url(f"agency/{muckrock_agency_id}")
+        try:
+            async with self.session.get(url) as results:
+                results.raise_for_status()
+                json = await results.json()
+                name = json["name"]
+                return AgencyLookupResponse(
+                    name=name, type=AgencyLookupResponseType.FOUND
+                )
+        except aiohttp.ClientResponseError as e:
+            return AgencyLookupResponse(
+                name=None,
+                type=AgencyLookupResponseType.ERROR,
+                error=str(e)
+            )
+        except KeyError:
+            return AgencyLookupResponse(
+                name=None, type=AgencyLookupResponseType.NOT_FOUND
+            )
+
diff --git a/alembic/versions/19bf57df581a_add_url_agency_suggestions.py b/alembic/versions/19bf57df581a_add_url_agency_suggestions.py
index aca55c13..608fcd1b 100644
--- a/alembic/versions/19bf57df581a_add_url_agency_suggestions.py
+++ b/alembic/versions/19bf57df581a_add_url_agency_suggestions.py
@@ -25,13 +25,31 @@
     'Confirmed', name='url_agency_suggestion_type'
 )
 
+old_task_options = (
+    'HTML',
+    'Relevancy',
+    'Record Type',
+)
+new_task_options = old_task_options + ('Agency Identification',)
+
+old_task_type_enum = PGEnum(
+    *old_task_options,
+    name='task_type_old'
+)
+
+new_task_type_enum = PGEnum(
+    *new_task_options,
+    name='task_type'
+)
+
 def upgrade() -> None:
+    op.execute("ALTER TYPE task_type ADD VALUE 'Agency Identification';")
     op.create_table('url_agency_suggestions',
     sa.Column('id', sa.Integer(), nullable=False),
     sa.Column('url_id', sa.Integer(), nullable=False),
     sa.Column('suggestion_type', suggestion_type_enum, nullable=False),
     sa.Column('agency_id', sa.Integer(), nullable=True),
-    sa.Column('agency_name', sa.String(), nullable=False),
+    sa.Column('agency_name', sa.String(), nullable=True),
     sa.Column('state', sa.String(), nullable=True),
     sa.Column('county', sa.String(), nullable=True),
     sa.Column('locality', sa.String(), nullable=True),
@@ -44,3 +62,8 @@
 def downgrade() -> None:
     op.drop_table('url_agency_suggestions')
     suggestion_type_enum.drop(op.get_bind(), checkfirst=True)
+    old_task_type_enum.create(op.get_bind())
+    op.execute("DELETE FROM TASKS;")
+    op.execute("ALTER TABLE tasks ALTER COLUMN task_type TYPE task_type_old USING task_type::text::task_type_old;")
+    new_task_type_enum.drop(op.get_bind(), checkfirst=True)
+    op.execute("ALTER TYPE task_type_old RENAME TO task_type;")
diff --git 
a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 04d40a82..de3c7f3d 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -18,12 +18,14 @@ from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ - RootURL, Task, TaskError, LinkTaskURL -from collector_manager.enums import URLStatus + RootURL, Task, TaskError, LinkTaskURL, URLAgencySuggestion, Batch +from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.enums import BatchStatus @@ -566,3 +568,58 @@ async def get_tasks( return GetTasksResponse( tasks=final_results ) + + @session_manager + async def has_urls_without_agency_suggestions( + self, + session: AsyncSession + ) -> bool: + statement = ( + select( + URL.id + )) + statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement) + raw_result = await session.execute(statement) + result = raw_result.all() + return len(result) != 0 + + @session_manager + async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> list[AgencyIdentificationTDO]: + statement = ( + select( + URL.id, + URL.collector_metadata, + Batch.strategy, + ).join(Batch) + ) + statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement) + statement = statement.limit(100) + raw_results = await session.execute(statement) + return [ + AgencyIdentificationTDO( + url_id=raw_result[0], + collector_metadata=raw_result[1], + collector_type=CollectorType(raw_result[2]) + ) + for raw_result in raw_results + ] + + @session_manager + async def add_agency_suggestions( + self, + session: AsyncSession, + suggestions: list[URLAgencySuggestionInfo] + ): + for suggestion in suggestions: + url_agency_suggestion = URLAgencySuggestion( + url_id=suggestion.url_id, + suggestion_type=suggestion.suggestion_type, + agency_id=suggestion.pdap_agency_id, + agency_name=suggestion.agency_name, + state=suggestion.state, + county=suggestion.county, + locality=suggestion.locality + ) + session.add(url_agency_suggestion) + + await session.commit() \ No newline at end of file diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index c042e10c..a04ed07f 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -2,7 +2,7 @@ from sqlalchemy import Select, select, exists, Table, func, Subquery from collector_db.enums import URLMetadataAttributeType, ValidationStatus -from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation +from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation, URLAgencySuggestion from collector_manager.enums import URLStatus @@ -57,3 +57,11 @@ def simple_count_subquery(model, attribute: str, label: str) -> Subquery: func.count(attr_value).label(label) ).group_by(attr_value).subquery() + @staticmethod 
+    def exclude_urls_with_agency_suggestions(
+            statement: Select
+    ):
+        return (statement.where(~exists(
+            select(URLAgencySuggestion.id).
+            where(URLAgencySuggestion.url_id == URL.id)
+        )))
diff --git a/collector_db/enums.py b/collector_db/enums.py
index a6f3c95e..2d82e87b 100644
--- a/collector_db/enums.py
+++ b/collector_db/enums.py
@@ -36,6 +36,7 @@ class TaskType(PyEnum):
     HTML = "HTML"
     RELEVANCY = "Relevancy"
     RECORD_TYPE = "Record Type"
+    AGENCY_IDENTIFICATION = "Agency Identification"
 
 class PGEnum(TypeDecorator):
     impl = postgresql.ENUM
diff --git a/collector_db/models.py b/collector_db/models.py
index 6ca04846..41c50048 100644
--- a/collector_db/models.py
+++ b/collector_db/models.py
@@ -24,7 +24,13 @@ class Batch(Base):
     id = Column(Integer, primary_key=True)
     strategy = Column(
         postgresql.ENUM(
-            'example', 'ckan', 'muckrock_county_search', 'auto_googler', 'muckrock_all_search', 'muckrock_simple_search', 'common_crawler',
+            'example',
+            'ckan',
+            'muckrock_county_search',
+            'auto_googler',
+            'muckrock_all_search',
+            'muckrock_simple_search',
+            'common_crawler',
             name='batch_strategy'),
         nullable=False)
     user_id = Column(Integer, nullable=False)
@@ -252,7 +258,11 @@ class Task(Base):
     id = Column(Integer, primary_key=True)
     task_type = Column(
         PGEnum(
-            'HTML', 'Relevancy', 'Record Type', name='task_type'
+            'HTML',
+            'Relevancy',
+            'Record Type',
+            'Agency Identification',
+            name='task_type'
         ), nullable=False)
     task_status = Column(batch_status_enum, nullable=False)
     updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT)
@@ -303,7 +313,8 @@ class URLAgencySuggestion(Base):
     url_id = Column(Integer, ForeignKey('urls.id'), nullable=False)
     suggestion_type = Column(
         PGEnum(
-            'Suggestion',
+            'Auto Suggestion',
+            'Manual Suggestion',
             'Unknown',
             'New Agency',
             'Confirmed',
@@ -312,7 +323,7 @@ class URLAgencySuggestion(Base):
         nullable=False
     )
     agency_id = Column(Integer, nullable=True)
-    agency_name = Column(String, nullable=False)
+    agency_name = Column(String, nullable=True)
     state = Column(String, nullable=True)
     county = Column(String, nullable=True)
     locality = Column(String, nullable=True)
diff --git a/core/DTOs/URLAgencySuggestionInfo.py b/core/DTOs/URLAgencySuggestionInfo.py
new file mode 100644
index 00000000..9729cfb5
--- /dev/null
+++ b/core/DTOs/URLAgencySuggestionInfo.py
@@ -0,0 +1,15 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from core.enums import SuggestionType
+
+
+class URLAgencySuggestionInfo(BaseModel):
+    url_id: int
+    suggestion_type: SuggestionType
+    pdap_agency_id: Optional[int] = None
+    agency_name: Optional[str] = None
+    state: Optional[str] = None
+    county: Optional[str] = None
+    locality: Optional[str] = None
diff --git a/core/DTOs/task_data_objects/AgencyIdentificationTDO.py b/core/DTOs/task_data_objects/AgencyIdentificationTDO.py
new file mode 100644
index 00000000..10c3ce99
--- /dev/null
+++ b/core/DTOs/task_data_objects/AgencyIdentificationTDO.py
@@ -0,0 +1,11 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from collector_manager.enums import CollectorType
+
+
+class AgencyIdentificationTDO(BaseModel):
+    url_id: int
+    collector_metadata: Optional[dict] = None
+    collector_type: CollectorType
diff --git a/core/classes/AgencyIdentificationTaskOperator.py b/core/classes/AgencyIdentificationTaskOperator.py
new file mode 100644
index 00000000..2c027a0f
--- /dev/null
+++ b/core/classes/AgencyIdentificationTaskOperator.py
@@ -0,0 +1,92 @@
+from agency_identifier.MuckrockAPIInterface import 
MuckrockAPIInterface +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo +from collector_db.enums import TaskType +from collector_manager.enums import CollectorType +from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO +from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.subtasks.AutoGooglerAgencyIdentificationSubtask import AutoGooglerAgencyIdentificationSubtask +from core.classes.subtasks.CKANAgencyIdentificationSubtask import CKANAgencyIdentificationSubtask +from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask +from core.classes.subtasks.MuckrockAgencyIdentificationSubtask import MuckrockAgencyIdentificationSubtask +from pdap_api_client.PDAPClient import PDAPClient + + +# TODO: Validate with Manual Tests + +class AgencyIdentificationTaskOperator(TaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient, + muckrock_api_interface: MuckrockAPIInterface, + ): + super().__init__(adb_client) + self.pdap_client = pdap_client + self.muckrock_api_interface = muckrock_api_interface + + @property + def task_type(self): + return TaskType.AGENCY_IDENTIFICATION + + async def meets_task_prerequisites(self): + has_urls_without_agency_suggestions = await self.adb_client.has_urls_without_agency_suggestions() + return has_urls_without_agency_suggestions + + async def get_pending_urls_without_agency_identification(self): + return await self.adb_client.get_urls_without_agency_suggestions() + + async def get_muckrock_subtask(self): + return MuckrockAgencyIdentificationSubtask( + muckrock_api_interface=self.muckrock_api_interface, + pdap_client=self.pdap_client + ) + + async def get_subtask(self, collector_type: CollectorType): + match collector_type: + case CollectorType.MUCKROCK_SIMPLE_SEARCH: + return await self.get_muckrock_subtask() + case CollectorType.MUCKROCK_COUNTY_SEARCH: + return await self.get_muckrock_subtask() + case CollectorType.MUCKROCK_ALL_SEARCH: + return await self.get_muckrock_subtask() + case CollectorType.AUTO_GOOGLER: + return AutoGooglerAgencyIdentificationSubtask() + case CollectorType.COMMON_CRAWLER: + return CommonCrawlerAgencyIdentificationSubtask() + case CollectorType.CKAN: + return CKANAgencyIdentificationSubtask( + pdap_client=self.pdap_client + ) + + @staticmethod + async def run_subtask(subtask, url_id, collector_metadata): + return await subtask.run(url_id=url_id, collector_metadata=collector_metadata) + + async def inner_task_logic(self): + tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification() + await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) + error_infos = [] + all_agency_suggestions = [] + for tdo in tdos: + subtask = await self.get_subtask(tdo.collector_type) + try: + new_agency_suggestions = await self.run_subtask( + subtask, + tdo.url_id, + tdo.collector_metadata + ) + all_agency_suggestions.extend(new_agency_suggestions) + except Exception as e: + error_info = URLErrorPydanticInfo( + task_id=self.task_id, + url_id=tdo.url_id, + error=str(e), + ) + error_infos.append(error_info) + + await self.adb_client.add_agency_suggestions(all_agency_suggestions) + await self.adb_client.add_url_error_infos(error_infos) + + diff --git a/core/classes/subtasks/AgencyIdentificationSubtaskBase.py b/core/classes/subtasks/AgencyIdentificationSubtaskBase.py new file mode 100644 
index 00000000..755cade5 --- /dev/null +++ b/core/classes/subtasks/AgencyIdentificationSubtaskBase.py @@ -0,0 +1,16 @@ +import abc +from abc import ABC +from typing import Optional + +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo + + +class AgencyIdentificationSubtaskBase(ABC): + + @abc.abstractmethod + async def run( + self, + url_id: int, + collector_metadata: Optional[dict] = None + ) -> list[URLAgencySuggestionInfo]: + raise NotImplementedError diff --git a/core/classes/subtasks/AutoGooglerAgencyIdentificationSubtask.py b/core/classes/subtasks/AutoGooglerAgencyIdentificationSubtask.py new file mode 100644 index 00000000..1e5d945b --- /dev/null +++ b/core/classes/subtasks/AutoGooglerAgencyIdentificationSubtask.py @@ -0,0 +1,25 @@ +from typing import Optional + +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.classes.subtasks.AgencyIdentificationSubtaskBase import AgencyIdentificationSubtaskBase +from core.enums import SuggestionType + + +class AutoGooglerAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): + + async def run( + self, + url_id: int, + collector_metadata: Optional[dict] = None + ) -> list[URLAgencySuggestionInfo]: + return [ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.UNKNOWN, + pdap_agency_id=None, + agency_name=None, + state=None, + county=None, + locality=None + ) + ] diff --git a/core/classes/subtasks/CKANAgencyIdentificationSubtask.py b/core/classes/subtasks/CKANAgencyIdentificationSubtask.py new file mode 100644 index 00000000..5eb88406 --- /dev/null +++ b/core/classes/subtasks/CKANAgencyIdentificationSubtask.py @@ -0,0 +1,29 @@ +from typing import Optional + +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.helpers import process_match_agency_response_to_suggestions +from pdap_api_client.PDAPClient import PDAPClient +from pdap_api_client.DTOs import MatchAgencyResponse + + +class CKANAgencyIdentificationSubtask: + + def __init__( + self, + pdap_client: PDAPClient + ): + self.pdap_client = pdap_client + + async def run( + self, + url_id: int, + collector_metadata: Optional[dict] + ) -> list[URLAgencySuggestionInfo]: + agency_name = collector_metadata["agency_name"] + match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_name + ) + return process_match_agency_response_to_suggestions( + url_id=url_id, + match_agency_response=match_agency_response + ) diff --git a/core/classes/subtasks/CommonCrawlerAgencyIdentificationSubtask.py b/core/classes/subtasks/CommonCrawlerAgencyIdentificationSubtask.py new file mode 100644 index 00000000..5d0fa409 --- /dev/null +++ b/core/classes/subtasks/CommonCrawlerAgencyIdentificationSubtask.py @@ -0,0 +1,23 @@ +from typing import Optional + +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.enums import SuggestionType + + +class CommonCrawlerAgencyIdentificationSubtask: + async def run( + self, + url_id: int, + collector_metadata: Optional[dict] + ) -> list[URLAgencySuggestionInfo]: + return [ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.UNKNOWN, + pdap_agency_id=None, + agency_name=None, + state=None, + county=None, + locality=None + ) + ] diff --git a/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py b/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py new file mode 100644 index 00000000..03f2a064 --- /dev/null +++ b/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py @@ -0,0 
+1,42 @@ +from typing import Optional + +from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponse, AgencyLookupResponseType +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.exceptions import MuckrockAPIError +from core.helpers import process_match_agency_response_to_suggestions +from pdap_api_client.PDAPClient import PDAPClient +from pdap_api_client.DTOs import MatchAgencyResponse + + +class MuckrockAgencyIdentificationSubtask: + + def __init__( + self, + muckrock_api_interface: MuckrockAPIInterface, + pdap_client: PDAPClient + ): + self.muckrock_api_interface = muckrock_api_interface + self.pdap_client = pdap_client + + async def run( + self, + url_id: int, + collector_metadata: Optional[dict] + ) -> list[URLAgencySuggestionInfo]: + muckrock_agency_id = collector_metadata["agency"] + agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( + muckrock_agency_id=muckrock_agency_id + ) + if agency_lookup_response.type != AgencyLookupResponseType.FOUND: + raise MuckrockAPIError( + f"Failed to lookup muckrock agency: {muckrock_agency_id}:" + f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" + ) + + match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_lookup_response.name + ) + return process_match_agency_response_to_suggestions( + url_id=url_id, + match_agency_response=match_agency_response + ) diff --git a/core/classes/subtasks/__init__.py b/core/classes/subtasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/enums.py b/core/enums.py index 605e49e5..213db47c 100644 --- a/core/enums.py +++ b/core/enums.py @@ -48,3 +48,11 @@ class RecordType(Enum): COURT_CASES = "Court Cases" INCARCERATION_RECORDS = "Incarceration Records" OTHER = "Other" + + +class SuggestionType(Enum): + AUTO_SUGGESTION = "Auto Suggestion" + MANUAL_SUGGESTION = "Manual Suggestion" + UNKNOWN = "Unknown" + NEW_AGENCY = "New Agency" + CONFIRMED = "Confirmed" diff --git a/core/exceptions.py b/core/exceptions.py index edaa32a3..d9685245 100644 --- a/core/exceptions.py +++ b/core/exceptions.py @@ -1,2 +1,10 @@ class InvalidPreprocessorError(Exception): pass + + +class MuckrockAPIError(Exception): + pass + + +class MatchAgencyError(Exception): + pass diff --git a/core/helpers.py b/core/helpers.py index e69de29b..bac603bd 100644 --- a/core/helpers.py +++ b/core/helpers.py @@ -0,0 +1,48 @@ +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.enums import SuggestionType +from core.exceptions import MatchAgencyError +from pdap_api_client.DTOs import MatchAgencyResponse +from pdap_api_client.enums import MatchAgencyResponseStatus + + +def process_match_agency_response_to_suggestions( + url_id: int, + match_agency_response: MatchAgencyResponse +) -> list[URLAgencySuggestionInfo]: + if match_agency_response.status == MatchAgencyResponseStatus.EXACT_MATCH: + match = match_agency_response.matches[0] + return [ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=int(match.id), + agency_name=match.submitted_name, + state=match.state, + county=match.county, + ) + ] + if match_agency_response.status == MatchAgencyResponseStatus.NO_MATCH: + return [ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.UNKNOWN, + ) + ] + + if match_agency_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: + raise MatchAgencyError( + f"Unknown Match 
Agency Response Status: {match_agency_response.status}"
+        )
+
+    return [
+        URLAgencySuggestionInfo(
+            url_id=url_id,
+            suggestion_type=SuggestionType.AUTO_SUGGESTION,
+            pdap_agency_id=match.id,
+            agency_name=match.submitted_name,
+            state=match.state,
+            county=match.county,
+            locality=match.locality
+        )
+        for match in match_agency_response.matches
+    ]
diff --git a/html_tag_collector/URLRequestInterface.py b/html_tag_collector/URLRequestInterface.py
index 6c6756d0..2b135516 100644
--- a/html_tag_collector/URLRequestInterface.py
+++ b/html_tag_collector/URLRequestInterface.py
@@ -1,5 +1,4 @@
 import asyncio
-import subprocess
 from typing import Optional
 
 from aiohttp import ClientSession
@@ -7,16 +6,10 @@
 
 from dataclasses import dataclass
 
-from requests import Response
 from tqdm.asyncio import tqdm
 
 MAX_CONCURRENCY = 5
 
-@dataclass
-class URLResponseInfoOld:
-    success: bool
-    response: Response or Exception
-
 @dataclass
 class URLResponseInfo:
     success: bool
diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py
index 87877466..0776fe0d 100644
--- a/pdap_api_client/AccessManager.py
+++ b/pdap_api_client/AccessManager.py
@@ -2,15 +2,16 @@
 from typing import Optional
 
 import requests
+from aiohttp import ClientSession, ClientResponseError
 
 from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo
 
 API_URL = "https://data-sources-v2.pdap.dev/api"
 request_methods = {
-    RequestType.POST: requests.post,
-    RequestType.PUT: requests.put,
-    RequestType.GET: requests.get,
-    RequestType.DELETE: requests.delete,
+    RequestType.POST: ClientSession.post,
+    RequestType.PUT: ClientSession.put,
+    RequestType.GET: ClientSession.get,
+    RequestType.DELETE: ClientSession.delete,
 }
 
 
@@ -32,15 +33,41 @@ class AccessManager:
     """
    Manages login, api key, access and refresh tokens
    """
-    def __init__(self, email: str, password: str, api_key: Optional[str] = None):
-        self.access_token = None
-        self.refresh_token = None
+    def __init__(
+        self,
+        session: ClientSession,
+        email: str,
+        password: str,
+        api_key: Optional[str] = None,
+    ):
+        self.session = session
+        self._access_token = None
+        self._refresh_token = None
         self.api_key = api_key
+        self.email = email
+        self.password = password
-        self.login(email=email, password=password)
 
+    @property
+    async def access_token(self):
+        if self._access_token is None:
+            await self.login(
+                email=self.email,
+                password=self.password
+            )
+        return self._access_token
+
+    @property
+    async def refresh_token(self):
+        if self._refresh_token is None:
+            await self.login(
+                email=self.email,
+                password=self.password
+            )
+        return self._refresh_token
+
     # TODO: Add means to refresh if token expired.
 
- def load_api_key(self): + async def load_api_key(self): url = build_url( namespace=Namespaces.AUTH, subdomains=["api-key"] @@ -48,41 +76,48 @@ def load_api_key(self): request_info = RequestInfo( type_ = RequestType.POST, url=url, - headers=self.jwt_header() + headers=await self.jwt_header() ) - response_info = self.make_request(request_info) + response_info = await self.make_request(request_info) self.api_key = response_info.data["api_key"] - def refresh_access_token(self): + async def refresh_access_token(self): url = build_url( namespace=Namespaces.AUTH, subdomains=["refresh-session"], ) - raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") + refresh_token = await self.refresh_token + rqi = RequestInfo( + type_=RequestType.POST, + url=url, + json={"refresh_token": refresh_token}, + headers=await self.jwt_header() + ) + rsi = await self.make_request(rqi) + data = rsi.data + self._access_token = data['access_token'] + self._refresh_token = data['refresh_token'] - def make_request(self, ri: RequestInfo) -> ResponseInfo: + async def make_request(self, ri: RequestInfo) -> ResponseInfo: try: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() + method = getattr(self.session, ri.type_.value.lower()) + async with method(**ri.kwargs()) as response: + response.raise_for_status() + json = await response.json() + return ResponseInfo( + status_code=HTTPStatus(response.status), + data=json + ) except requests.RequestException as e: # TODO: Precise string matching here is brittle. Consider changing later. - if e.response.json().message == "Token is expired. Please request a new token.": - self.refresh_access_token() - return self.make_request(ri) + if json.message == "Token is expired. 
Please request a new token.": + await self.refresh_access_token() + return await self.make_request(ri) else: raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") - return ResponseInfo( - status_code=HTTPStatus(response.status_code), - data=response.json() - ) - def login(self, email: str, password: str): + + async def login(self, email: str, password: str): url = build_url( namespace=Namespaces.AUTH, subdomains=["login"] @@ -95,19 +130,20 @@ def login(self, email: str, password: str): "password": password } ) - response_info = self.make_request(request_info) + response_info = await self.make_request(request_info) data = response_info.data - self.access_token = data["access_token"] - self.refresh_token = data["refresh_token"] + self._access_token = data["access_token"] + self._refresh_token = data["refresh_token"] - def jwt_header(self) -> dict: + async def jwt_header(self) -> dict: """ Retrieve JWT header Returns: Dictionary of Bearer Authorization with JWT key """ + access_token = await self.access_token return { - "Authorization": f"Bearer {self.access_token}" + "Authorization": f"Bearer {access_token}" } def api_key_header(self): diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py index 31c8c2cf..19255a35 100644 --- a/pdap_api_client/DTOs.py +++ b/pdap_api_client/DTOs.py @@ -1,13 +1,18 @@ from enum import Enum from http import HTTPStatus -from typing import Optional +from typing import Optional, List from pydantic import BaseModel +from pdap_api_client.enums import MatchAgencyResponseStatus + class MatchAgencyInfo(BaseModel): + id: int submitted_name: str - id: str + state: Optional[str] = None + county: Optional[str] = None + locality: Optional[str] = None class ApprovalStatus(Enum): APPROVED = "approved" @@ -48,7 +53,22 @@ class RequestInfo(BaseModel): params: Optional[dict] = None timeout: Optional[int] = 10 + def kwargs(self) -> dict: + d = { + "url": self.url, + } + if self.json is not None: + d['json'] = self.json + if self.headers is not None: + d['headers'] = self.headers + return d + class ResponseInfo(BaseModel): status_code: HTTPStatus data: Optional[dict] + + +class MatchAgencyResponse(BaseModel): + status: MatchAgencyResponseStatus + matches: List[MatchAgencyInfo] diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index 6c03ce0f..b2b89564 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,22 +1,27 @@ -from typing import List +from typing import Optional from pdap_api_client.AccessManager import build_url, AccessManager from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ - RequestType, RequestInfo + RequestType, RequestInfo, MatchAgencyResponse +from pdap_api_client.enums import MatchAgencyResponseStatus class PDAPClient: - def __init__(self, access_manager: AccessManager): + def __init__( + self, + access_manager: AccessManager, + ): self.access_manager = access_manager - def match_agency( + async def match_agency( self, name: str, - state: str, - county: str, - locality: str - ) -> List[MatchAgencyInfo]: + state: Optional[str] = None, + county: Optional[str] = None, + locality: Optional[str] = None + ) -> MatchAgencyResponse: + # TODO: Change to async """ Returns agencies, if any, that match or partially match the search criteria """ @@ -24,9 +29,12 @@ def match_agency( namespace=Namespaces.MATCH, subdomains=["agency"] ) + headers = await self.access_manager.jwt_header() + headers['Content-Type'] = "application/json" 
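# A minimal sketch of the enum-driven aiohttp dispatch that make_request above
# relies on; this RequestType and the URL are illustrative stand-ins, not part
# of this patch.
import asyncio
from enum import Enum

from aiohttp import ClientSession


class RequestType(Enum):
    GET = "GET"
    POST = "POST"


async def demo():
    async with ClientSession() as session:
        # getattr(session, "get") resolves to ClientSession.get bound to this
        # session, so one code path can serve every HTTP verb the enum names.
        method = getattr(session, RequestType.GET.value.lower())
        async with method("https://example.com") as response:
            print(response.status)


asyncio.run(demo())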
request_info = RequestInfo( type_=RequestType.POST, url=url, + headers=headers, json={ "name": name, "state": state, @@ -34,11 +42,24 @@ def match_agency( "locality": locality } ) - response_info = self.access_manager.make_request(request_info) - return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] + response_info = await self.access_manager.make_request(request_info) + + matches = [ + MatchAgencyInfo( + id = agency['id'], + submitted_name=agency['name'], + state=agency['state'], + county=agency['county'], + locality=agency['locality'] + ) + for agency in response_info.data["agencies"]] + return MatchAgencyResponse( + status=MatchAgencyResponseStatus(response_info.data["status"]), + matches=matches + ) - def is_url_unique( + async def is_url_unique( self, url_to_check: str ) -> UniqueURLResponseInfo: @@ -56,7 +77,7 @@ def is_url_unique( "url": url_to_check } ) - response_info = self.access_manager.make_request(request_info) + response_info = await self.access_manager.make_request(request_info) duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] is_unique = (len(duplicates) == 0) return UniqueURLResponseInfo( diff --git a/pdap_api_client/enums.py b/pdap_api_client/enums.py new file mode 100644 index 00000000..3dc7d931 --- /dev/null +++ b/pdap_api_client/enums.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class MatchAgencyResponseStatus(Enum): + EXACT_MATCH = "Exact Match" + PARTIAL_MATCH = "Partial Matches" + NO_MATCH = "No Match" diff --git a/requirements.txt b/requirements.txt index b72804e7..48f86981 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,3 +46,4 @@ PyJWT~=2.10.1 pytest-timeout~=2.3.1 openai~=1.60.1 +aiohttp~=3.11.11 \ No newline at end of file diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 0041fad5..c9a6b31a 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -1,5 +1,7 @@ from typing import List, Optional +from pydantic import BaseModel + from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo @@ -15,6 +17,10 @@ from tests.helpers.simple_test_data_functions import generate_test_urls +class BatchURLCreationInfo(BaseModel): + batch_id: int + url_ids: list[int] + class DBDataCreator: """ Assists in the creation of test data @@ -23,10 +29,10 @@ def __init__(self, db_client: DatabaseClient = DatabaseClient()): self.db_client = db_client self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() - def batch(self): + def batch(self, strategy: CollectorType = CollectorType.EXAMPLE) -> int: return self.db_client.insert_batch( BatchInfo( - strategy=CollectorType.EXAMPLE.value, + strategy=strategy.value, status=BatchStatus.IN_PROCESS, total_url_count=1, parameters={"test_key": "test_value"}, @@ -40,6 +46,22 @@ async def task(self, url_ids: Optional[list[int]] = None) -> int: await self.adb_client.link_urls_to_task(task_id=task_id, url_ids=url_ids) return task_id + async def batch_and_urls( + self, + strategy: CollectorType = CollectorType.EXAMPLE, + url_count: int = 1, + with_html_content: bool = False + ) -> BatchURLCreationInfo: + batch_id = self.batch(strategy=strategy) + iuis: InsertURLsInfo = self.urls(batch_id=batch_id, url_count=url_count) + url_ids = [iui.url_id for iui in iuis.url_mappings] + if with_html_content: + await self.html_data(url_ids) + + return BatchURLCreationInfo(batch_id=batch_id, 
url_ids=url_ids) + + + def urls(self, batch_id: int, url_count: int) -> InsertURLsInfo: raw_urls = generate_test_urls(url_count) url_infos: List[URLInfo] = [] diff --git a/tests/manual/agency_identifier/__init__.py b/tests/manual/agency_identifier/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/agency_identifier/test_muckrock_api_interface.py b/tests/manual/agency_identifier/test_muckrock_api_interface.py new file mode 100644 index 00000000..2dac6bd4 --- /dev/null +++ b/tests/manual/agency_identifier/test_muckrock_api_interface.py @@ -0,0 +1,16 @@ +import pytest +from aiohttp import ClientSession + +from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface + + +@pytest.mark.asyncio +async def test_muckrock_api_interface(): + + async with ClientSession() as session: + muckrock_api_interface = MuckrockAPIInterface(session=session) + + response = await muckrock_api_interface.lookup_agency( + muckrock_agency_id=1 + ) + print(response) diff --git a/tests/manual/pdap_client/__init__.py b/tests/manual/pdap_client/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/pdap_client/test_access_manager.py b/tests/manual/pdap_client/test_access_manager.py new file mode 100644 index 00000000..ff08ee0e --- /dev/null +++ b/tests/manual/pdap_client/test_access_manager.py @@ -0,0 +1,23 @@ +import pytest +from aiohttp import ClientSession + +from pdap_api_client.AccessManager import AccessManager +from util.helper_functions import get_from_env + + +@pytest.mark.asyncio +async def test_refresh_session(): + async with ClientSession() as session: + access_manager = AccessManager( + email=get_from_env("PDAP_EMAIL"), + password=get_from_env("PDAP_PASSWORD"), + api_key=get_from_env("PDAP_API_KEY", allow_none=True), + session=session + ) + old_access_token = await access_manager.access_token + old_refresh_token = await access_manager.refresh_token + await access_manager.refresh_access_token() + new_access_token = await access_manager.access_token + new_refresh_token = await access_manager.refresh_token + assert old_access_token != new_access_token + assert old_refresh_token != new_refresh_token diff --git a/tests/manual/pdap_client/test_pdap_client.py b/tests/manual/pdap_client/test_pdap_client.py new file mode 100644 index 00000000..b1232244 --- /dev/null +++ b/tests/manual/pdap_client/test_pdap_client.py @@ -0,0 +1,23 @@ +import pytest +from aiohttp import ClientSession + +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.PDAPClient import PDAPClient +from util.helper_functions import get_from_env + + +@pytest.mark.asyncio +async def test_match_agency(): + + async with ClientSession() as session: + access_manager = AccessManager( + email=get_from_env("PDAP_EMAIL"), + password=get_from_env("PDAP_PASSWORD"), + api_key=get_from_env("PDAP_API_KEY", allow_none=True), + session=session + ) + pdap_client = PDAPClient(access_manager=access_manager) + + response = await pdap_client.match_agency(name="police") + + print(response) diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 5ff2f239..f6703d29 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -195,7 +195,7 @@ def post_record_type_annotation_and_get_next( ) -> GetNextURLForAnnotationResponse: data = self.post( url=f"/annotate/record-type/{metadata_id}", - 
json=record_type_annotation_post_info.model_dump() + json=record_type_annotation_post_info.model_dump(mode="json") ) return GetNextURLForAnnotationResponse(**data) @@ -206,7 +206,7 @@ def post_relevance_annotation_and_get_next( ) -> GetNextURLForAnnotationResponse: data = self.post( url=f"/annotate/relevance/{metadata_id}", - json=relevance_annotation_post_info.model_dump() + json=relevance_annotation_post_info.model_dump(mode="json") ) return GetNextURLForAnnotationResponse(**data) diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 899c7f28..1ee03963 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -78,9 +78,9 @@ async def run_annotation_test( assert results[0].value == expected_metadata_value # Submit this one in turn, and no subsequent annotation info should be returned - request_info_3 = ath.request_validator.post_relevance_annotation_and_get_next( - metadata_id=inner_info_2.metadata_id, - relevance_annotation_post_info=post_info + request_info_3 = submit_and_get_next_function( + inner_info_2.metadata_id, + post_info ) assert request_info_3.next_annotation is None diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py new file mode 100644 index 00000000..94f6c1d3 --- /dev/null +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -0,0 +1,255 @@ +import types +from typing import Optional +from unittest.mock import MagicMock, AsyncMock, patch + +import pytest +from aiohttp import ClientSession + +from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse +from collector_manager.enums import CollectorType +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator +from core.classes.subtasks.AutoGooglerAgencyIdentificationSubtask import AutoGooglerAgencyIdentificationSubtask +from core.classes.subtasks.CKANAgencyIdentificationSubtask import CKANAgencyIdentificationSubtask +from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask +from core.classes.subtasks.MuckrockAgencyIdentificationSubtask import MuckrockAgencyIdentificationSubtask +from core.enums import SuggestionType +from helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.DTOs import MatchAgencyResponse, MatchAgencyInfo +from pdap_api_client.PDAPClient import PDAPClient +from pdap_api_client.enums import MatchAgencyResponseStatus + + +@pytest.mark.asyncio +async def test_agency_preannotation_task(db_data_creator: DBDataCreator): + async def mock_run_subtask( + subtask, + url_id: int, + collector_metadata: Optional[dict] + ): + return [ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.UNKNOWN, + pdap_agency_id=None, + agency_name=None, + state=None, + county=None, + locality=None + ) + ] + + async with ClientSession() as session: + mock = MagicMock() + access_manager = AccessManager( + email=mock.email, + password=mock.password, + api_key=mock.api_key, + session=session + ) + pdap_client = PDAPClient( + access_manager=access_manager + ) + muckrock_api_interface = MuckrockAPIInterface(session=session) + with 
patch.object( + AgencyIdentificationTaskOperator, + "run_subtask", + side_effect=mock_run_subtask, + ) as mock: + operator = AgencyIdentificationTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=pdap_client, + muckrock_api_interface=muckrock_api_interface + ) + + # Try to run initially and confirm it doesn't run + # due to not meeting prerequisites + await operator.run_task() + + d = {} + + # Create six urls, one from each strategy + for strategy in [ + CollectorType.COMMON_CRAWLER, + CollectorType.AUTO_GOOGLER, + CollectorType.MUCKROCK_COUNTY_SEARCH, + CollectorType.MUCKROCK_SIMPLE_SEARCH, + CollectorType.MUCKROCK_ALL_SEARCH, + CollectorType.CKAN + ]: + creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls(strategy=strategy, url_count=1, with_html_content=True) + d[strategy] = creation_info.url_ids[0] + + # Run task + await operator.run_task() + + # Confirm tasks are piped into the correct subtasks + # * common_crawler into common_crawler_subtask + # * auto_googler into auto_googler_subtask + # * muckrock_county_search into muckrock_subtask + # * muckrock_simple_search into muckrock_subtask + # * muckrock_all_search into muckrock_subtask + # * ckan into ckan_subtask + + assert mock.call_count == 6 + + + # Confirm subtask classes are correct for the given urls + d2 = {} + for call_arg in mock.call_args_list: + subtask_class = call_arg[0][0].__class__ + url_id = call_arg[0][1] + d2[url_id] = subtask_class + + + subtask_class_collector_type = [ + (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_ALL_SEARCH), + (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_COUNTY_SEARCH), + (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_SIMPLE_SEARCH), + (CKANAgencyIdentificationSubtask, CollectorType.CKAN), + (CommonCrawlerAgencyIdentificationSubtask, CollectorType.COMMON_CRAWLER), + (AutoGooglerAgencyIdentificationSubtask, CollectorType.AUTO_GOOGLER) + ] + + for subtask_class, collector_type in subtask_class_collector_type: + url_id = d[collector_type] + assert d2[url_id] == subtask_class + + # Run task again and confirm it doesn't call any additional subtasks + await operator.run_task() + + assert mock.call_count == 6 + +@pytest.mark.asyncio +async def test_common_crawler_subtask(db_data_creator: DBDataCreator): + # Test that common_crawler subtask correctly adds URL to + # url_agency_suggestions with label 'Unknown' + subtask = CommonCrawlerAgencyIdentificationSubtask() + results: list[URLAgencySuggestionInfo] = await subtask.run(url_id=1, collector_metadata={}) + assert len(results) == 1 + assert results[0].url_id == 1 + assert results[0].suggestion_type == SuggestionType.UNKNOWN + + +@pytest.mark.asyncio +async def test_auto_googler_subtask(db_data_creator: DBDataCreator): + # Test that auto_googler subtask correctly adds URL to + # url_agency_suggestions with label 'Unknown' + subtask = AutoGooglerAgencyIdentificationSubtask() + results: list[URLAgencySuggestionInfo] = await subtask.run(url_id=1, collector_metadata={}) + assert len(results) == 1 + assert results[0].url_id == 1 + assert results[0].suggestion_type == SuggestionType.UNKNOWN + +@pytest.mark.asyncio +async def test_muckrock_subtask(db_data_creator: DBDataCreator): + # Test that muckrock subtask correctly sends agency name to + # MatchAgenciesInterface and adds received suggestions to + # url_agency_suggestions + + # Create mock instances for dependency injections + muckrock_api_interface_mock = MagicMock(spec=MuckrockAPIInterface) + pdap_client_mock = 
MagicMock(spec=PDAPClient) + + # Set up mock return values for method calls + muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( + type=AgencyLookupResponseType.FOUND, + name="Mock Agency Name", + error=None + ) + + pdap_client_mock.match_agency.return_value = MatchAgencyResponse( + status=MatchAgencyResponseStatus.PARTIAL_MATCH, + matches=[ + MatchAgencyInfo( + id=1, + submitted_name="Mock Agency Name", + ), + MatchAgencyInfo( + id=2, + submitted_name="Another Mock Agency Name", + ) + ] + ) + + # Create an instance of MuckrockAgencyIdentificationSubtask with mock dependencies + muckrock_agency_identification_subtask = MuckrockAgencyIdentificationSubtask( + muckrock_api_interface=muckrock_api_interface_mock, + pdap_client=pdap_client_mock + ) + + # Run the subtask + results: list[URLAgencySuggestionInfo] = await muckrock_agency_identification_subtask.run( + url_id=1, + collector_metadata={ + "agency": 123 + } + ) + + # Verify the results + assert len(results) == 2 + assert results[0].url_id == 1 + assert results[0].suggestion_type == SuggestionType.AUTO_SUGGESTION + assert results[0].pdap_agency_id == 1 + assert results[0].agency_name == "Mock Agency Name" + assert results[1].url_id == 1 + assert results[1].suggestion_type == SuggestionType.AUTO_SUGGESTION + assert results[1].pdap_agency_id == 2 + assert results[1].agency_name == "Another Mock Agency Name" + + # Assert methods called as expected + muckrock_api_interface_mock.lookup_agency.assert_called_once_with( + muckrock_agency_id=123 + ) + pdap_client_mock.match_agency.assert_called_once_with( + name="Mock Agency Name" + ) + + +@pytest.mark.asyncio +async def test_ckan_subtask(db_data_creator: DBDataCreator): + # Test that ckan subtask correctly sends agency id to + # CKANAPIInterface, sends resultant agency name to + # PDAPClient and adds received suggestions to + # url_agency_suggestions + + pdap_client = AsyncMock() + pdap_client.match_agency.return_value = MatchAgencyResponse( + status=MatchAgencyResponseStatus.PARTIAL_MATCH, + matches=[ + MatchAgencyInfo( + id=1, + submitted_name="Mock Agency Name", + ), + MatchAgencyInfo( + id=2, + submitted_name="Another Mock Agency Name", + ) + ] + ) # Assuming MatchAgencyResponse is a class + + # Create an instance of CKANAgencyIdentificationSubtask + task = CKANAgencyIdentificationSubtask(pdap_client) + + # Call the run method with static values + collector_metadata = {"agency_name": "Test Agency"} + url_id = 1 + + # Call the run method + result = await task.run(url_id, collector_metadata) + + # Check the result + assert len(result) == 2 + assert result[0].url_id == 1 + assert result[0].suggestion_type == SuggestionType.AUTO_SUGGESTION + assert result[0].pdap_agency_id == 1 + assert result[0].agency_name == "Mock Agency Name" + assert result[1].url_id == 1 + assert result[1].suggestion_type == SuggestionType.AUTO_SUGGESTION + assert result[1].pdap_agency_id == 2 + assert result[1].agency_name == "Another Mock Agency Name" + + # Assert methods called as expected + pdap_client.match_agency.assert_called_once_with(name="Test Agency") + diff --git a/util/helper_functions.py b/util/helper_functions.py index ccc7d96e..bf72d39b 100644 --- a/util/helper_functions.py +++ b/util/helper_functions.py @@ -9,10 +9,10 @@ def get_enum_values(enum: Type[Enum]): return [item.value for item in enum] -def get_from_env(key: str): +def get_from_env(key: str, allow_none: bool = False): load_dotenv() val = os.getenv(key) - if val is None: + if val is None and not allow_none: raise 
ValueError(f"Environment variable {key} is not set") return val From ff1999f80d72d4522d695baa420fdd3516579d2c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 5 Feb 2025 09:59:54 -0500 Subject: [PATCH 042/182] Add alembic directory --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index fae4de32..6cf1d6a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,6 +25,7 @@ COPY hugging_face/HuggingFaceInterface.py ./hugging_face/HuggingFaceInterface.py COPY source_collectors ./source_collectors COPY util ./util COPY alembic.ini ./alembic.ini +COPY alembic ./alembic COPY apply_migrations.py ./apply_migrations.py COPY security_manager ./security_manager COPY execute.sh ./execute.sh From 83f25cb39e749eda3d817695581076d59f22e599 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 5 Feb 2025 10:58:36 -0500 Subject: [PATCH 043/182] Add logic for Agency Identification task --- core/AsyncCore.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 6ab9fcf5..5e0a9590 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,5 +1,8 @@ import logging +from aiohttp import ClientSession + +from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo @@ -8,6 +11,7 @@ from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo +from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator @@ -17,6 +21,9 @@ from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.PDAPClient import PDAPClient +from util.helper_functions import get_from_env class AsyncCore: @@ -26,7 +33,7 @@ def __init__( adb_client: AsyncDatabaseClient, huggingface_interface: HuggingFaceInterface, url_request_interface: URLRequestInterface, - html_parser: HTMLResponseParser + html_parser: HTMLResponseParser, ): self.adb_client = adb_client self.huggingface_interface = huggingface_interface @@ -60,10 +67,30 @@ async def run_url_record_type_task(self): ) await operator.run_task() + async def run_agency_identification_task(self): + self.logger.info("Running Agency Identification Task") + async with ClientSession() as session: + pdap_client = PDAPClient( + access_manager=AccessManager( + email=get_from_env("PDAP_EMAIL"), + password=get_from_env("PDAP_PASSWORD"), + api_key=get_from_env("PDAP_API_KEY"), + session=session + ), + ) + muckrock_api_interface = MuckrockAPIInterface(session=session) + operator = AgencyIdentificationTaskOperator( + adb_client=self.adb_client, + pdap_client=pdap_client, + muckrock_api_interface=muckrock_api_interface + ) + await operator.run_task() + async def run_tasks(self): await self.run_url_html_task() await self.run_url_relevance_huggingface_task() await self.run_url_record_type_task() + await 
self.run_agency_identification_task() async def convert_to_annotation_request_info(self, url_info: URLAnnotationInfo) -> AnnotationRequestInfo: response_html_info = convert_to_response_html_info( From 2e5bc3688f7253d77add7df4228cdcef1c69bfac Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 5 Feb 2025 10:59:12 -0500 Subject: [PATCH 044/182] Add `user_id` and trigger enforcement logic for url_agency_suggestion --- ...ae_add_user_url_agency_suggestions_and_.py | 63 +++++++++++++++++++ collector_db/models.py | 3 +- tests/test_alembic/test_revisions.py | 14 +++++ .../collector_db/test_database_structure.py | 14 +++++ 4 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py diff --git a/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py b/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py new file mode 100644 index 00000000..8eadb6a3 --- /dev/null +++ b/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py @@ -0,0 +1,63 @@ +"""Add user_url_agency_suggestions and trigger + +Revision ID: 8c44e02733ae +Revises: 19bf57df581a +Create Date: 2025-02-05 10:33:46.002025 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import Column, Integer +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '8c44e02733ae' +down_revision: Union[str, None] = '19bf57df581a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + table_name='url_agency_suggestions', + column=Column( + name="user_id", + type_=Integer, + nullable=True + ) + ) + + op.execute( + """ + CREATE OR REPLACE FUNCTION user_url_agency_suggestions_value() + RETURNS TRIGGER AS $$ + BEGIN + IF NEW.suggestion_type = 'Manual Suggestion' and NEW.user_id IS NULL THEN + RAISE EXCEPTION 'User ID must not be null when suggestion type is "Manual Suggestion"'; + END IF; + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + + CREATE TRIGGER enforce_url_agency_suggestions_manual_suggestion_user_id + BEFORE INSERT ON url_agency_suggestions + FOR EACH ROW + EXECUTE FUNCTION user_url_agency_suggestions_value(); + + """ + ) + + +def downgrade() -> None: + op.drop_column( + table_name='url_agency_suggestions', + column_name="user_id" + ) + op.execute( + """ + DROP TRIGGER IF EXISTS enforce_url_agency_suggestions_manual_suggestion_user_id; + DROP FUNCTION IF EXISTS user_url_agency_suggestions_value(); + """ + ) diff --git a/collector_db/models.py b/collector_db/models.py index 41c50048..81d9d2c3 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -327,7 +327,8 @@ class URLAgencySuggestion(Base): state = Column(String, nullable=True) county = Column(String, nullable=True) locality = Column(String, nullable=True) + user_id = Column(Integer, nullable=True) updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) # Relationships - url = relationship("URL", back_populates="agency_suggestions") \ No newline at end of file + url = relationship("URL", back_populates="agency_suggestions") diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py index e94f4180..51096ec5 100644 --- a/tests/test_alembic/test_revisions.py +++ b/tests/test_alembic/test_revisions.py @@ -345,3 +345,17 @@ def test_add_url_agency_suggestions(alembic_runner): start_revision="072b32a45b1c", 
end_revision="19bf57df581a" ) + +def test_add_user_url_agency_suggestions(alembic_runner): + def column_check() -> bool: + return columns_in_table( + alembic_runner, + table_name="url_agency_suggestions", + columns_to_check=["user_id"] + ) + + alembic_runner.upgrade("19bf57df581a") + assert not column_check() + alembic_runner.reflect() + alembic_runner.upgrade("8c44e02733ae") + assert column_check() diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py index 272f3de2..462f3b24 100644 --- a/tests/test_automated/integration/collector_db/test_database_structure.py +++ b/tests/test_automated/integration/collector_db/test_database_structure.py @@ -326,3 +326,17 @@ def test_root_url(db_data_creator: DBDataCreator): ) table_tester.run_column_tests() + +@pytest.mark.asyncio +async def test_url_agency_suggestions_trigger(db_data_creator: DBDataCreator): + # Check that if an entry is added to the user_url_agency_suggestions table, + # The trigger checks that the corresponding entry `url_agency_suggestions` has value 'Manual Suggestion' + # And raises an error if not + dbdc = db_data_creator + await dbdc.batch_and_urls( + with_html_content=True + ) + # Insert + + + pytest.fail("Not implemented") \ No newline at end of file From 5975ce647868d885190437a88dd91b99ed9c4521 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 5 Feb 2025 10:59:23 -0500 Subject: [PATCH 045/182] Add first draft of agency annotation api endpoint --- api/routes/annotate.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 27b21708..04641878 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -73,3 +73,30 @@ async def annotate_url_for_record_type_and_get_next_url( metadata_type=URLMetadataAttributeType.RECORD_TYPE ) return result + +async def get_next_url_for_agency_annotation( + access_info: AccessInfo = Depends(get_access_info), + async_core: AsyncCore = Depends(get_async_core), +) -> GetNextURLForAnnotationResponse: + result = await async_core.get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_type=URLMetadataAttributeType.AGENCY + ) + return result + +async def annotate_url_for_agency_and_get_next_url( + agency_annotation_post_info: RecordTypeAnnotationPostInfo, + metadata_id: int = Path(description="The metadata id for the associated URL metadata"), + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetNextURLForAnnotationResponse: + """ + Post URL annotation and get next URL to annotate + """ + result = await async_core.submit_and_get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_id=metadata_id, + annotation=agency_annotation_post_info.agency.value, + metadata_type=URLMetadataAttributeType.AGENCY + ) + return result \ No newline at end of file From d8076fd2e3bc485f49a881047ba0bcb2cb27e738 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 7 Feb 2025 15:59:53 -0500 Subject: [PATCH 046/182] Begin draft on revision of agency identification task --- ...daf0_revise_agency_identification_logic.py | 107 ++++++++++++++++++ api/routes/annotate.py | 23 ++-- collector_db/AsyncDatabaseClient.py | 59 +++++++--- collector_db/StatementComposer.py | 18 ++- collector_db/models.py | 83 ++++++++++---- core/AsyncCore.py | 29 ++++- .../GetNextURLForAgencyAnnotationResponse.py | 22 ++++ core/DTOs/URLAgencySuggestionInfo.py | 1 + 
.../AgencyIdentificationTaskOperator.py | 9 +- tests/test_alembic/AlembicRunner.py | 7 ++ tests/test_alembic/test_revisions.py | 16 +++ .../collector_db/test_database_structure.py | 82 ++++++++++++-- .../tasks/test_agency_preannotation_task.py | 64 ++++++++--- 13 files changed, 447 insertions(+), 73 deletions(-) create mode 100644 alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py create mode 100644 core/DTOs/GetNextURLForAgencyAnnotationResponse.py diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py new file mode 100644 index 00000000..ebe6cdf5 --- /dev/null +++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py @@ -0,0 +1,107 @@ +"""Revise agency identification logic + +Revision ID: d7eb670edaf0 +Revises: 8c44e02733ae +Create Date: 2025-02-07 13:10:41.181578 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from collector_db.enums import PGEnum + +# revision identifiers, used by Alembic. +revision: str = 'd7eb670edaf0' +down_revision: Union[str, None] = '8c44e02733ae' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +suggestion_type_enum = PGEnum( + 'Auto Suggestion', + 'Manual Suggestion', + 'Unknown', + 'New Agency', + 'Confirmed', name='url_agency_suggestion_type' +) + +def upgrade(): + # Create agencies table + op.create_table( + "agencies", + sa.Column("agency_id", sa.Integer(), primary_key=True), + sa.Column("name", sa.String(), nullable=False), + sa.Column("state", sa.String(), nullable=True), + sa.Column("county", sa.String(), nullable=True), + sa.Column("locality", sa.String(), nullable=True), + sa.Column("updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now()), + ) + + # Create confirmed_url_agency table + op.create_table( + "confirmed_url_agency", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("agency_id", sa.Integer(), sa.ForeignKey("agencies.agency_id"), nullable=False), + sa.Column("url_id", sa.Integer(), sa.ForeignKey("urls.id"), nullable=False), + ) + op.create_unique_constraint( + "uq_confirmed_url_agency", "confirmed_url_agency", ["agency_id", "url_id"] + ) + + # Create automated_url_agency_suggestions table + op.create_table( + "automated_url_agency_suggestions", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("agency_id", sa.Integer(), sa.ForeignKey("agencies.agency_id"), nullable=False), + sa.Column("url_id", sa.Integer(), sa.ForeignKey("urls.id"), nullable=False), + sa.Column("is_unknown", sa.Boolean(), nullable=True), + ) + op.create_unique_constraint( + "uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", ["agency_id", "url_id"] + ) + + # Create user_url_agency_suggestions table + op.create_table( + "user_url_agency_suggestions", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("agency_id", sa.Integer(), sa.ForeignKey("agencies.agency_id"), nullable=True), + sa.Column("url_id", sa.Integer(), sa.ForeignKey("urls.id"), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column("is_new", sa.Boolean(), nullable=True), + ) + op.create_unique_constraint( + "uq_user_url_agency_suggestions", "user_url_agency_suggestions", ["agency_id", "url_id", "user_id"] + ) + + op.drop_table('url_agency_suggestions') + suggestion_type_enum.drop(op.get_bind(), checkfirst=True) + + + 
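# A minimal sketch of the Postgres ENUM lifecycle handling used in the
# migration above, written against sqlalchemy.dialects.postgresql.ENUM rather
# than the project's PGEnum wrapper; demo_status and demo are illustrative
# names, not part of this migration.
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from alembic import op

status_enum = postgresql.ENUM("a", "b", name="demo_status", create_type=False)


def upgrade_sketch():
    # create_type=False keeps SQLAlchemy from emitting CREATE TYPE implicitly,
    # so the migration creates the type explicitly and can guard with checkfirst.
    status_enum.create(op.get_bind(), checkfirst=True)
    op.add_column("demo", sa.Column("status", status_enum, nullable=True))


def downgrade_sketch():
    op.drop_column("demo", "status")
    # checkfirst=True makes the DROP TYPE a no-op if the type is already gone.
    status_enum.drop(op.get_bind(), checkfirst=True)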
+def downgrade(): + # Drop constraints first + op.drop_constraint("uq_confirmed_url_agency", "confirmed_url_agency", type_="unique") + op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") + op.drop_constraint("uq_user_url_agency_suggestions", "user_url_agency_suggestions", type_="unique") + + # Drop tables + op.drop_table("user_url_agency_suggestions") + op.drop_table("automated_url_agency_suggestions") + op.drop_table("confirmed_url_agency") + op.drop_table("agencies") + + op.create_table('url_agency_suggestions', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('url_id', sa.Integer(), nullable=False), + sa.Column('suggestion_type', suggestion_type_enum, nullable=False), + sa.Column('agency_id', sa.Integer(), nullable=True), + sa.Column('agency_name', sa.String(), nullable=True), + sa.Column('state', sa.String(), nullable=True), + sa.Column('county', sa.String(), nullable=True), + sa.Column('locality', sa.String(), nullable=True), + sa.Column('updated_at', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=False), + sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ), + sa.PrimaryKeyConstraint('id') + ) + diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 04641878..980c16f9 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -3,6 +3,8 @@ from api.dependencies import get_async_core from collector_db.enums import URLMetadataAttributeType from core.AsyncCore import AsyncCore +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ + URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo @@ -77,26 +79,27 @@ async def annotate_url_for_record_type_and_get_next_url( async def get_next_url_for_agency_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAnnotationResponse: - result = await async_core.get_next_url_for_annotation( +) -> GetNextURLForAgencyAnnotationResponse: + result = await async_core.get_next_url_agency_for_annotation( user_id=access_info.user_id, - metadata_type=URLMetadataAttributeType.AGENCY ) return result async def annotate_url_for_agency_and_get_next_url( - agency_annotation_post_info: RecordTypeAnnotationPostInfo, - metadata_id: int = Path(description="The metadata id for the associated URL metadata"), + url_id: int, + agency_annotation_post_info: URLAgencyAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info) -) -> GetNextURLForAnnotationResponse: +) -> GetNextURLForAgencyAnnotationResponse: """ Post URL annotation and get next URL to annotate """ - result = await async_core.submit_and_get_next_url_for_annotation( + await async_core.submit_url_agency_annotation( + user_id=access_info.user_id, + url_id=url_id, + agency_post_info=agency_annotation_post_info + ) + result = await async_core.get_next_url_agency_for_annotation( user_id=access_info.user_id, - metadata_id=metadata_id, - annotation=agency_annotation_post_info.agency.value, - metadata_type=URLMetadataAttributeType.AGENCY ) return result \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index de3c7f3d..dbc2d3dc 100644 --- 
a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -3,7 +3,7 @@ from sqlalchemy import select, exists, func from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker -from sqlalchemy.orm import selectinload +from sqlalchemy.orm import selectinload, aliased from collector_db.ConfigManager import ConfigManager from collector_db.DTOs.MetadataAnnotationInfo import MetadataAnnotationInfo @@ -18,7 +18,7 @@ from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ - RootURL, Task, TaskError, LinkTaskURL, URLAgencySuggestion, Batch + RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ @@ -26,7 +26,7 @@ from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO -from core.enums import BatchStatus +from core.enums import BatchStatus, SuggestionType def add_standard_limit_and_offset(statement, page, limit=100): @@ -585,12 +585,18 @@ async def has_urls_without_agency_suggestions( @session_manager async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> list[AgencyIdentificationTDO]: + """ + Retrieve URLs without confirmed or suggested agencies + Args: + session: + + Returns: + + """ + statement = ( - select( - URL.id, - URL.collector_metadata, - Batch.strategy, - ).join(Batch) + select(URL.id, URL.collector_metadata, Batch.strategy) + .join(Batch) ) statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement) statement = statement.limit(100) @@ -605,21 +611,48 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li ] @session_manager - async def add_agency_suggestions( + async def upsert_new_agencies( self, session: AsyncSession, suggestions: list[URLAgencySuggestionInfo] ): + """ + Add or update agencies in the database + """ for suggestion in suggestions: - url_agency_suggestion = URLAgencySuggestion( - url_id=suggestion.url_id, - suggestion_type=suggestion.suggestion_type, + agency = Agency( agency_id=suggestion.pdap_agency_id, - agency_name=suggestion.agency_name, + name=suggestion.agency_name, state=suggestion.state, county=suggestion.county, locality=suggestion.locality ) + await session.merge(agency) + + async def add_confirmed_agency_url_links( + self, + session: AsyncSession, + suggestions: list[URLAgencySuggestionInfo] + ): + for suggestion in suggestions: + confirmed_agency_url_link = ConfirmedUrlAgency( + agency_id=suggestion.pdap_agency_id, + url_id=suggestion.url_id + ) + session.add(confirmed_agency_url_link) + + @session_manager + async def add_agency_auto_suggestions( + self, + session: AsyncSession, + suggestions: list[URLAgencySuggestionInfo] + ): + for suggestion in suggestions: + url_agency_suggestion = AutomatedUrlAgencySuggestion( + url_id=suggestion.url_id, + agency_id=suggestion.pdap_agency_id, + is_unknown=suggestion.suggestion_type == 
SuggestionType.UNKNOWN + ) session.add(url_agency_suggestion) await session.commit() \ No newline at end of file diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index a04ed07f..fd1b11a9 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,8 +1,10 @@ from sqlalchemy import Select, select, exists, Table, func, Subquery +from sqlalchemy.orm import aliased from collector_db.enums import URLMetadataAttributeType, ValidationStatus -from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation, URLAgencySuggestion +from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation, AutomatedUrlAgencySuggestion, \ + ConfirmedUrlAgency from collector_manager.enums import URLStatus @@ -61,7 +63,13 @@ def simple_count_subquery(model, attribute: str, label: str) -> Subquery: def exclude_urls_with_agency_suggestions( statement: Select ): - return (statement.where(~exists( - select(URLAgencySuggestion.id). - where(URLAgencySuggestion.url_id == URL.id) - ))) + # Aliases for clarity + AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) + ConfirmedAgency = aliased(ConfirmedUrlAgency) + + statement = (statement + .where(~exists().where(AutomatedSuggestion.url_id == URL.id)) # Exclude if automated suggestions exist + .where(~exists().where(ConfirmedAgency.url_id == URL.id)) + ) # Exclude if confirmed agencies exist + + return statement \ No newline at end of file diff --git a/collector_db/models.py b/collector_db/models.py index 81d9d2c3..064cf660 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -1,7 +1,8 @@ """ SQLAlchemy ORM models """ -from sqlalchemy import func, Column, Integer, String, TIMESTAMP, Float, JSON, ForeignKey, Text, UniqueConstraint +from sqlalchemy import func, Column, Integer, String, TIMESTAMP, Float, JSON, ForeignKey, Text, UniqueConstraint, \ + Boolean, DateTime from sqlalchemy.dialects import postgresql from sqlalchemy.orm import declarative_base, relationship @@ -96,7 +97,9 @@ class URL(Base): secondary="link_task_urls", back_populates="urls", ) - agency_suggestions = relationship("URLAgencySuggestion", back_populates="url", cascade="all, delete-orphan") + automated_agency_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="url") + user_agency_suggestions = relationship("UserUrlAgencySuggestion", back_populates="url") + confirmed_agencies = relationship("ConfirmedUrlAgency", back_populates="url") # URL Metadata table definition @@ -306,29 +309,65 @@ class TaskError(Base): name="uq_task_id_error"), ) -class URLAgencySuggestion(Base): - __tablename__ = 'url_agency_suggestions' +class Agency(Base): + __tablename__ = "agencies" - id = Column(Integer, primary_key=True) - url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) - suggestion_type = Column( - PGEnum( - 'Auto Suggestion', - 'Manual Suggestion' - 'Unknown', - 'New Agency', - 'Confirmed', - name='url_agency_suggestion_type' - ), - nullable=False - ) - agency_id = Column(Integer, nullable=True) - agency_name = Column(String, nullable=True) + agency_id = Column(Integer, primary_key=True) + name = Column(String, nullable=False) state = Column(String, nullable=True) county = Column(String, nullable=True) locality = Column(String, nullable=True) - user_id = Column(Integer, nullable=True) - updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + updated_at = Column(DateTime, nullable=False, default=func.now()) # Relationships - 
url = relationship("URL", back_populates="agency_suggestions") + confirmed_urls = relationship("ConfirmedUrlAgency", back_populates="agency") + automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") + user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") + + +class ConfirmedUrlAgency(Base): + __tablename__ = "confirmed_url_agency" + + id = Column(Integer, primary_key=True, autoincrement=True) + agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + + agency = relationship("Agency", back_populates="confirmed_urls") + url = relationship("URL", back_populates="confirmed_agencies") + + __table_args__ = ( + UniqueConstraint("agency_id", "url_id", name="uq_confirmed_url_agency"), + ) + + +class AutomatedUrlAgencySuggestion(Base): + __tablename__ = "automated_url_agency_suggestions" + + id = Column(Integer, primary_key=True, autoincrement=True) + agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + is_unknown = Column(Boolean, nullable=True) + + agency = relationship("Agency", back_populates="automated_suggestions") + url = relationship("URL", back_populates="automated_agency_suggestions") + + __table_args__ = ( + UniqueConstraint("agency_id", "url_id", name="uq_automated_url_agency_suggestions"), + ) + + +class UserUrlAgencySuggestion(Base): + __tablename__ = "user_url_agency_suggestions" + + id = Column(Integer, primary_key=True, autoincrement=True) + agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=True) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + user_id = Column(Integer, nullable=False) + is_new = Column(Boolean, nullable=True) + + agency = relationship("Agency", back_populates="user_suggestions") + url = relationship("URL", back_populates="user_agency_suggestions") + + __table_args__ = ( + UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), + ) \ No newline at end of file diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 5e0a9590..11225da2 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -7,6 +7,8 @@ from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo from collector_db.enums import TaskType, URLMetadataAttributeType +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ + URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo @@ -15,7 +17,7 @@ from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator -from core.enums import BatchStatus +from core.enums import BatchStatus, SuggestionType from html_tag_collector.DataClassTags import convert_to_response_html_info from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface @@ -157,3 +159,28 @@ async def get_task_info(self, task_id: int) -> TaskInfo: async def get_tasks(self, page: int, task_type: TaskType, task_status: BatchStatus) -> GetTasksResponse: return await 
self.adb_client.get_tasks(page=page, task_type=task_type, task_status=task_status) + + async def get_next_url_agency_for_annotation( + self, + user_id: int + ) -> GetNextURLForAgencyAnnotationResponse: + return await self.adb_client.get_next_url_agency_for_annotation(user_id=user_id) + + async def submit_url_agency_annotation( + self, + user_id: int, + url_id: int, + agency_post_info: URLAgencyAnnotationPostInfo + ) -> GetNextURLForAgencyAnnotationResponse: + if agency_post_info.suggested_agency == "NEW": + suggestion_type = SuggestionType.NEW_AGENCY + agency_suggestion_id = None + else: + suggestion_type = SuggestionType.MANUAL_SUGGESTION + agency_suggestion_id = agency_post_info.suggested_agency + return await self.adb_client.submit_url_agency_annotation( + user_id=user_id, + url_id=url_id, + suggestion_type=suggestion_type, + agency_suggestion_id=agency_suggestion_id + ) diff --git a/core/DTOs/GetNextURLForAgencyAnnotationResponse.py b/core/DTOs/GetNextURLForAgencyAnnotationResponse.py new file mode 100644 index 00000000..4710275e --- /dev/null +++ b/core/DTOs/GetNextURLForAgencyAnnotationResponse.py @@ -0,0 +1,22 @@ +from typing import Optional, Literal + +from core.enums import SuggestionType +from html_tag_collector.DataClassTags import ResponseHTMLInfo + +class GetNextURLForAgencyAgencyInfo: + suggestion_type: SuggestionType + pdap_agency_id: Optional[int] = None + agency_name: Optional[str] = None + state: Optional[str] = None + county: Optional[str] = None + locality: Optional[str] = None + +class GetNextURLForAgencyAnnotationResponse: + url_id: int + agency_suggestions: list[ + GetNextURLForAgencyAgencyInfo + ] + html_info: ResponseHTMLInfo + +class URLAgencyAnnotationPostInfo: + suggested_agency: int | Literal["NEW"] \ No newline at end of file diff --git a/core/DTOs/URLAgencySuggestionInfo.py b/core/DTOs/URLAgencySuggestionInfo.py index 9729cfb5..2eae0496 100644 --- a/core/DTOs/URLAgencySuggestionInfo.py +++ b/core/DTOs/URLAgencySuggestionInfo.py @@ -13,3 +13,4 @@ class URLAgencySuggestionInfo(BaseModel): state: Optional[str] = None county: Optional[str] = None locality: Optional[str] = None + user_id: Optional[int] = None diff --git a/core/classes/AgencyIdentificationTaskOperator.py b/core/classes/AgencyIdentificationTaskOperator.py index 2c027a0f..05fac3cc 100644 --- a/core/classes/AgencyIdentificationTaskOperator.py +++ b/core/classes/AgencyIdentificationTaskOperator.py @@ -3,12 +3,14 @@ from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType from collector_manager.enums import CollectorType +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.classes.TaskOperatorBase import TaskOperatorBase from core.classes.subtasks.AutoGooglerAgencyIdentificationSubtask import AutoGooglerAgencyIdentificationSubtask from core.classes.subtasks.CKANAgencyIdentificationSubtask import CKANAgencyIdentificationSubtask from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask from core.classes.subtasks.MuckrockAgencyIdentificationSubtask import MuckrockAgencyIdentificationSubtask +from core.enums import SuggestionType from pdap_api_client.PDAPClient import PDAPClient @@ -61,7 +63,7 @@ async def get_subtask(self, collector_type: CollectorType): ) @staticmethod - async def run_subtask(subtask, url_id, collector_metadata): + async def run_subtask(subtask, url_id, 
collector_metadata) -> list[URLAgencySuggestionInfo]: return await subtask.run(url_id=url_id, collector_metadata=collector_metadata) async def inner_task_logic(self): @@ -86,7 +88,10 @@ async def inner_task_logic(self): ) error_infos.append(error_info) - await self.adb_client.add_agency_suggestions(all_agency_suggestions) + await self.adb_client.upsert_new_agencies(all_agency_suggestions) + confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED] + await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions) + await self.adb_client.add_agency_auto_suggestions(all_agency_suggestions) await self.adb_client.add_url_error_infos(error_infos) diff --git a/tests/test_alembic/AlembicRunner.py b/tests/test_alembic/AlembicRunner.py index 51347d55..867c8ba3 100644 --- a/tests/test_alembic/AlembicRunner.py +++ b/tests/test_alembic/AlembicRunner.py @@ -21,6 +21,7 @@ def reflect(self): def upgrade(self, revision: str): command.upgrade(self.alembic_config, revision) + self.reflect() def downgrade(self, revision: str): print("Downgrading...") @@ -33,3 +34,9 @@ def reset_schema(self): self.connection.exec_driver_sql("DROP SCHEMA public CASCADE;") self.connection.exec_driver_sql("CREATE SCHEMA public;") self.connection.commit() + + def table_exists(self, table_name: str) -> bool: + return table_name in self.inspector.get_table_names() + + def tables_exist(self, table_names: list[str]) -> bool: + return all(table_name in self.inspector.get_table_names() for table_name in table_names) diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py index 51096ec5..aa3deb87 100644 --- a/tests/test_alembic/test_revisions.py +++ b/tests/test_alembic/test_revisions.py @@ -359,3 +359,19 @@ def column_check() -> bool: alembic_runner.reflect() alembic_runner.upgrade("8c44e02733ae") assert column_check() + +def test_revise_agency_suggestions(alembic_runner): + + tables_to_check = [ + "user_url_agency_suggestions", + "automated_url_agency_suggestions", + "agencies", + "confirmed_url_agency" + ] + + alembic_runner.upgrade("8c44e02733ae") + assert alembic_runner.table_exists("url_agency_suggestions") + assert not alembic_runner.tables_exist(tables_to_check) + alembic_runner.upgrade("d7eb670edaf0") + assert not alembic_runner.table_exists("url_agency_suggestions") + assert alembic_runner.tables_exist(tables_to_check) \ No newline at end of file diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py index 462f3b24..30cf6f27 100644 --- a/tests/test_automated/integration/collector_db/test_database_structure.py +++ b/tests/test_automated/integration/collector_db/test_database_structure.py @@ -14,15 +14,16 @@ import sqlalchemy as sa from sqlalchemy import create_engine from sqlalchemy.dialects import postgresql -from sqlalchemy.exc import DataError +from sqlalchemy.exc import DataError, DBAPIError from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.enums import URLHTMLContentType from collector_db.helper_functions import get_postgres_connection_string -from collector_db.models import Base +from collector_db.models import Base, Agency from collector_manager.enums import CollectorType, URLStatus -from core.enums import BatchStatus -from tests.helpers.DBDataCreator import DBDataCreator +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.enums import 
diff --git a/tests/test_alembic/AlembicRunner.py b/tests/test_alembic/AlembicRunner.py
index 51347d55..867c8ba3 100644
--- a/tests/test_alembic/AlembicRunner.py
+++ b/tests/test_alembic/AlembicRunner.py
@@ -21,6 +21,7 @@ def reflect(self):

     def upgrade(self, revision: str):
         command.upgrade(self.alembic_config, revision)
+        self.reflect()

     def downgrade(self, revision: str):
         print("Downgrading...")
@@ -33,3 +34,9 @@ def reset_schema(self):
         self.connection.exec_driver_sql("DROP SCHEMA public CASCADE;")
         self.connection.exec_driver_sql("CREATE SCHEMA public;")
         self.connection.commit()
+
+    def table_exists(self, table_name: str) -> bool:
+        return table_name in self.inspector.get_table_names()
+
+    def tables_exist(self, table_names: list[str]) -> bool:
+        return all(table_name in self.inspector.get_table_names() for table_name in table_names)
diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py
index 51096ec5..aa3deb87 100644
--- a/tests/test_alembic/test_revisions.py
+++ b/tests/test_alembic/test_revisions.py
@@ -359,3 +359,19 @@ def column_check() -> bool:
     alembic_runner.reflect()
     alembic_runner.upgrade("8c44e02733ae")
     assert column_check()
+
+def test_revise_agency_suggestions(alembic_runner):
+
+    tables_to_check = [
+        "user_url_agency_suggestions",
+        "automated_url_agency_suggestions",
+        "agencies",
+        "confirmed_url_agency"
+    ]
+
+    alembic_runner.upgrade("8c44e02733ae")
+    assert alembic_runner.table_exists("url_agency_suggestions")
+    assert not alembic_runner.tables_exist(tables_to_check)
+    alembic_runner.upgrade("d7eb670edaf0")
+    assert not alembic_runner.table_exists("url_agency_suggestions")
+    assert alembic_runner.tables_exist(tables_to_check)
\ No newline at end of file
diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py
index 462f3b24..30cf6f27 100644
--- a/tests/test_automated/integration/collector_db/test_database_structure.py
+++ b/tests/test_automated/integration/collector_db/test_database_structure.py
@@ -14,15 +14,16 @@
 import sqlalchemy as sa
 from sqlalchemy import create_engine
 from sqlalchemy.dialects import postgresql
-from sqlalchemy.exc import DataError
+from sqlalchemy.exc import DataError, DBAPIError

 from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo
 from collector_db.enums import URLHTMLContentType
 from collector_db.helper_functions import get_postgres_connection_string
-from collector_db.models import Base
+from collector_db.models import Base, Agency
 from collector_manager.enums import CollectorType, URLStatus
-from core.enums import BatchStatus
-from tests.helpers.DBDataCreator import DBDataCreator
+from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
+from core.enums import BatchStatus, SuggestionType
+from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo
 from util.helper_functions import get_enum_values

 SATypes: TypeAlias = sa.Integer or sa.String or postgresql.ENUM or sa.TIMESTAMP or sa.Text
@@ -333,10 +334,77 @@ async def test_url_agency_suggestions_trigger(db_data_creator: DBDataCreator):
     # The trigger checks that the corresponding entry `url_agency_suggestions` has value 'Manual Suggestion'
     # And raises an error if not
     dbdc = db_data_creator
-    await dbdc.batch_and_urls(
+    creation_info: BatchURLCreationInfo = await dbdc.batch_and_urls(
         with_html_content=True
     )
-    # Insert
+    # Insert agency suggestion
+    suggestion_info = URLAgencySuggestionInfo(
+        url_id=creation_info.url_ids[0],
+        suggestion_type=SuggestionType.MANUAL_SUGGESTION,
+        pdap_agency_id=1,
+        agency_name="Test Agency",
+        state="Test State",
+        county="Test County",
+        locality="Test Locality",
+        user_id=None
+    )
+
+    adb_client = dbdc.adb_client
+
+    # Without the User ID, should fail
+    with pytest.raises(DBAPIError):
+        await adb_client.add_agency_suggestions([suggestion_info])
+
+    # With the User ID, should succeed
+    suggestion_info.user_id = 1
+    await adb_client.add_agency_suggestions([suggestion_info])
+
+@pytest.mark.asyncio
+async def test_upsert_new_agencies(db_data_creator: DBDataCreator):
+    """
+    Check that if the agency doesn't exist, it is added
+    But if the agency does exist, it is updated with new information
+    """
+
+    suggestions = []
+    for i in range(3):
+        suggestion = URLAgencySuggestionInfo(
+            url_id=1,
+            suggestion_type=SuggestionType.AUTO_SUGGESTION,
+            pdap_agency_id=i,
+            agency_name=f"Test Agency {i}",
+            state=f"Test State {i}",
+            county=f"Test County {i}",
+            locality=f"Test Locality {i}",
+            user_id=1
+        )
+        suggestions.append(suggestion)
+
+    adb_client = db_data_creator.adb_client
+    await adb_client.upsert_new_agencies(suggestions)
+
+    update_suggestion = URLAgencySuggestionInfo(
+        url_id=1,
+        suggestion_type=SuggestionType.AUTO_SUGGESTION,
+        pdap_agency_id=0,
+        agency_name="Updated Test Agency",
+        state="Updated Test State",
+        county="Updated Test County",
+        locality="Updated Test Locality",
+        user_id=1
+    )
+
+    await adb_client.upsert_new_agencies([update_suggestion])
+
+    rows = await adb_client.get_all(Agency)
+
+    assert len(rows) == 3
+
+    d = {}
+    for row in rows:
+        d[row.agency_id] = row.name
+    assert d[0] == "Updated Test Agency"
+    assert d[1] == "Test Agency 1"
+    assert d[2] == "Test Agency 2"

-    pytest.fail("Not implemented")
\ No newline at end of file
diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
index 94f6c1d3..2ee9ec45 100644
--- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
+++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
@@ -6,6 +6,7 @@
 from aiohttp import ClientSession

 from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse
+from collector_db.models import ConfirmedUrlAgency, Agency, AutomatedUrlAgencySuggestion
 from collector_manager.enums import CollectorType
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator
@@ -14,12 +15,38 @@
 from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask
 from core.classes.subtasks.MuckrockAgencyIdentificationSubtask import MuckrockAgencyIdentificationSubtask
 from core.enums import SuggestionType
-from helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo
 from pdap_api_client.AccessManager import AccessManager
 from pdap_api_client.DTOs import MatchAgencyResponse, MatchAgencyInfo
 from pdap_api_client.PDAPClient import PDAPClient
 from pdap_api_client.enums import MatchAgencyResponseStatus
-
+from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo
+
+sample_agency_suggestions = [
+    URLAgencySuggestionInfo(
+        suggestion_type=SuggestionType.UNKNOWN,
+        pdap_agency_id=None,
+        agency_name=None,
+        state=None,
+        county=None,
+        locality=None
+    ),
+    URLAgencySuggestionInfo(
+        suggestion_type=SuggestionType.CONFIRMED,
+        pdap_agency_id=1,
+        agency_name="Test Agency",
+        state="Test State",
+        county="Test County",
+        locality="Test Locality"
+    ),
+    URLAgencySuggestionInfo(
+        suggestion_type=SuggestionType.AUTO_SUGGESTION,
+        pdap_agency_id=2,
+        agency_name="Test Agency 2",
+        state="Test State 2",
+        county="Test County 2",
+        locality="Test Locality 2"
+    )
+]

 @pytest.mark.asyncio
 async def test_agency_preannotation_task(db_data_creator: DBDataCreator):
@@ -28,17 +55,10 @@ async def mock_run_subtask(
             url_id: int,
             collector_metadata: Optional[dict]
     ):
-        return [
-            URLAgencySuggestionInfo(
-                url_id=url_id,
-                suggestion_type=SuggestionType.UNKNOWN,
-                pdap_agency_id=None,
-                agency_name=None,
-                state=None,
-                county=None,
-                locality=None
-            )
-        ]
+        val = url_id % 3
+        suggestion = sample_agency_suggestions[val]
+        suggestion.url_id = url_id
+        return [suggestion]

     async with ClientSession() as session:
         mock = MagicMock()
@@ -121,6 +141,24 @@ async def mock_run_subtask(

     assert mock.call_count == 6

+
+    # Check confirmed and auto suggestions
+    adb_client = db_data_creator.adb_client
+    confirmed_suggestions = await adb_client.get_all(ConfirmedUrlAgency)
+    assert len(confirmed_suggestions) == 2
+
+    agencies = await adb_client.get_all(Agency)
+    assert len(agencies) == 2
+
+    auto_suggestions = await adb_client.get_all(AutomatedUrlAgencySuggestion)
+    assert len(auto_suggestions) == 4
+
+    # Of the auto suggestions, 2 should be unknown
+    assert len([s for s in auto_suggestions if s.is_unknown]) == 2
+
+    # Of the auto suggestions, 2 should not be unknown
+    assert len([s for s in auto_suggestions if not s.is_unknown]) == 2
+
 @pytest.mark.asyncio
 async def test_common_crawler_subtask(db_data_creator: DBDataCreator):
     # Test that common_crawler subtask correctly adds URL to
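The migration tests above lean on a small pattern worth noting: upgrade() now calls reflect(), so table-existence checks can be made immediately after applying a revision. A rough sketch, assuming the alembic_runner fixture used by these tests and that reflect() refreshes the inspector:

    def test_some_revision(alembic_runner):
        # State before the revision under test
        alembic_runner.upgrade("8c44e02733ae")
        assert alembic_runner.table_exists("url_agency_suggestions")

        # upgrade() re-reflects, so the inspector sees the new tables at once
        alembic_runner.upgrade("d7eb670edaf0")
        assert alembic_runner.tables_exist(["agencies", "confirmed_url_agency"])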
From 412ad7e78654dc36d29d647b32b8c80b21c2a54f Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Fri, 7 Feb 2025 19:12:11 -0500
Subject: [PATCH 047/182] Revise TaskOperator logic

---
 ...daf0_revise_agency_identification_logic.py |  25 ++-
 collector_db/AsyncDatabaseClient.py           |   1 +
 collector_db/DTOs/InsertURLsInfo.py           |   1 +
 collector_db/DatabaseClient.py                |   1 +
 collector_db/models.py                        |   2 +-
 core/AsyncCore.py                             |  96 ++++++----
 core/DTOs/TaskOperatorRunInfo.py              |  14 ++
 .../AgencyIdentificationTaskOperator.py       |   6 +-
 core/classes/TaskOperatorBase.py              |  37 ++--
 tests/helpers/DBDataCreator.py                |   5 +-
 tests/test_alembic/AlembicRunner.py           |   9 +-
 tests/test_alembic/test_revisions.py          | 136 ++++++-------
 .../collector_db/test_database_structure.py   |  30 ----
 .../integration/core/test_async_core.py       | 118 ++++++++++++
 .../tasks/test_agency_preannotation_task.py   |  33 ++--
 .../integration/tasks/test_example_task.py    |  44 +----
 .../integration/tasks/test_url_html_task.py   |  20 +--
 .../tasks/test_url_record_type_task.py        |  16 +-
 .../test_url_relevancy_huggingface_task.py    |   4 +-
 19 files changed, 385 insertions(+), 213 deletions(-)
 create mode 100644 core/DTOs/TaskOperatorRunInfo.py
 create mode 100644 tests/test_automated/integration/core/test_async_core.py

diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py
index ebe6cdf5..4db28b9d 100644
--- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py
+++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py
@@ -53,13 +53,30 @@ def upgrade():
     op.create_table(
         "automated_url_agency_suggestions",
         sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
-        sa.Column("agency_id", sa.Integer(), sa.ForeignKey("agencies.agency_id"), nullable=False),
+        sa.Column("agency_id", sa.Integer(), sa.ForeignKey("agencies.agency_id"), nullable=True),
         sa.Column("url_id", sa.Integer(), sa.ForeignKey("urls.id"), nullable=False),
         sa.Column("is_unknown", sa.Boolean(), nullable=True),
     )
     op.create_unique_constraint(
         "uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", ["agency_id", "url_id"]
     )
+    op.execute("""
+        CREATE OR REPLACE FUNCTION enforce_no_agency_id_if_unknown()
+        RETURNS TRIGGER AS $$
+        BEGIN
+            IF NEW.is_unknown = TRUE AND NEW.agency_id IS NOT NULL THEN
+                RAISE EXCEPTION 'agency_id must be null when is_unknown is TRUE';
+            END IF;
+            RETURN NEW;
+        END;
+        $$ LANGUAGE plpgsql;
+    """)
+    op.execute("""
+        CREATE TRIGGER enforce_no_agency_id_if_unknown
+        BEFORE INSERT ON automated_url_agency_suggestions
+        FOR EACH ROW
+        EXECUTE FUNCTION enforce_no_agency_id_if_unknown();
+    """)

     # Create user_url_agency_suggestions table
     op.create_table(
@@ -104,4 +121,10 @@ def downgrade():
         sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ),
         sa.PrimaryKeyConstraint('id')
     )
+    op.execute("""
+        DROP TRIGGER IF EXISTS enforce_no_agency_id_if_unknown ON automated_url_agency_suggestions;
+    """)
+    op.execute("""
+        DROP FUNCTION IF EXISTS enforce_no_agency_id_if_unknown;
+    """)
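The trigger above moves the agency_id/is_unknown invariant into Postgres itself. A minimal sketch of the behavior it guarantees, assuming an async session fixture against the migrated schema and an existing urls row with id 1 (both assumptions, not part of the patch):

    import pytest
    from sqlalchemy import text
    from sqlalchemy.exc import DBAPIError

    @pytest.mark.asyncio
    async def test_unknown_rows_cannot_carry_an_agency(async_session):  # fixture assumed
        # is_unknown = TRUE combined with a non-null agency_id violates the trigger
        with pytest.raises(DBAPIError):
            await async_session.execute(text(
                "INSERT INTO automated_url_agency_suggestions (agency_id, url_id, is_unknown) "
                "VALUES (1, 1, TRUE)"
            ))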
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index dbc2d3dc..d89ffc10 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -629,6 +629,7 @@ async def upsert_new_agencies(
             )
             await session.merge(agency)

+    @session_manager
     async def add_confirmed_agency_url_links(
             self,
             session: AsyncSession,
diff --git a/collector_db/DTOs/InsertURLsInfo.py b/collector_db/DTOs/InsertURLsInfo.py
index 079510d2..da2ee39a 100644
--- a/collector_db/DTOs/InsertURLsInfo.py
+++ b/collector_db/DTOs/InsertURLsInfo.py
@@ -5,6 +5,7 @@

 class InsertURLsInfo(BaseModel):
     url_mappings: list[URLMapping]
+    url_ids: list[int]
     total_count: int = 0
     original_count: int = 0
     duplicate_count: int = 0
diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py
index 2a659f3f..372cca8e 100644
--- a/collector_db/DatabaseClient.py
+++ b/collector_db/DatabaseClient.py
@@ -125,6 +125,7 @@ def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo
             total_count=len(url_infos),
             original_count=len(url_mappings),
             duplicate_count=len(duplicates),
+            url_ids=[url_mapping.url_id for url_mapping in url_mappings]
         )

     @session_manager
diff --git a/collector_db/models.py b/collector_db/models.py
index 064cf660..ee43f35b 100644
--- a/collector_db/models.py
+++ b/collector_db/models.py
@@ -344,7 +344,7 @@ class AutomatedUrlAgencySuggestion(Base):
     __tablename__ = "automated_url_agency_suggestions"

     id = Column(Integer, primary_key=True, autoincrement=True)
-    agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False)
+    agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=True)
     url_id = Column(Integer, ForeignKey("urls.id"), nullable=False)
     is_unknown = Column(Boolean, nullable=True)
diff --git a/core/AsyncCore.py b/core/AsyncCore.py
index 11225da2..2ec17da5 100644
--- a/core/AsyncCore.py
+++ b/core/AsyncCore.py
@@ -13,7 +13,9 @@
 from core.DTOs.GetTasksResponse import GetTasksResponse
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo
 from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome
 from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator
+from core.classes.TaskOperatorBase import TaskOperatorBase
 from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator
 from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator
 from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator
@@ -44,55 +46,89 @@ def __init__(
         self.logger = logging.getLogger(__name__)
         self.logger.setLevel(logging.INFO)

-    async def run_url_html_task(self):
+    async def get_url_html_task_operator(self):
         self.logger.info("Running URL HTML Task")
         operator = URLHTMLTaskOperator(
             adb_client=self.adb_client,
             url_request_interface=self.url_request_interface,
             html_parser=self.html_parser
         )
-        await operator.run_task()
+        return operator

-    async def run_url_relevance_huggingface_task(self):
+    async def get_url_relevance_huggingface_task_operator(self):
         self.logger.info("Running URL Relevance Huggingface Task")
         operator = URLRelevanceHuggingfaceTaskOperator(
             adb_client=self.adb_client,
             huggingface_interface=self.huggingface_interface
         )
-        await operator.run_task()
+        return operator

-    async def run_url_record_type_task(self):
-        self.logger.info("Running URL Record Type Task")
+    async def get_url_record_type_task_operator(self):
         operator = URLRecordTypeTaskOperator(
             adb_client=self.adb_client,
             classifier=OpenAIRecordClassifier()
         )
-        await operator.run_task()
-
-    async def run_agency_identification_task(self):
-        self.logger.info("Running Agency Identification Task")
-        async with ClientSession() as session:
-            pdap_client = PDAPClient(
-                access_manager=AccessManager(
-                    email=get_from_env("PDAP_EMAIL"),
-                    password=get_from_env("PDAP_PASSWORD"),
-                    api_key=get_from_env("PDAP_API_KEY"),
-                    session=session
-                ),
-            )
-            muckrock_api_interface = MuckrockAPIInterface(session=session)
-            operator = AgencyIdentificationTaskOperator(
-                adb_client=self.adb_client,
-                pdap_client=pdap_client,
-                muckrock_api_interface=muckrock_api_interface
-            )
-            await operator.run_task()
+        return operator
+
+    async def get_agency_identification_task_operator(self):
+        session = ClientSession()
+        pdap_client = PDAPClient(
+            access_manager=AccessManager(
+                email=get_from_env("PDAP_EMAIL"),
+                password=get_from_env("PDAP_PASSWORD"),
+                api_key=get_from_env("PDAP_API_KEY"),
+                session=session
+            ),
+        )
+        muckrock_api_interface = MuckrockAPIInterface(session=session)
+        operator = AgencyIdentificationTaskOperator(
+            adb_client=self.adb_client,
+            pdap_client=pdap_client,
+            muckrock_api_interface=muckrock_api_interface
+        )
+        return operator
+
+    async def get_task_operators(self) -> list[TaskOperatorBase]:
+        return [
+            await self.get_url_html_task_operator(),
+            await self.get_url_relevance_huggingface_task_operator(),
+            await self.get_url_record_type_task_operator(),
+            await self.get_agency_identification_task_operator()
+        ]

     async def run_tasks(self):
-        await self.run_url_html_task()
-        await self.run_url_relevance_huggingface_task()
-        await self.run_url_record_type_task()
-        await self.run_agency_identification_task()
+        operators = await self.get_task_operators()
+        for operator in operators:
+            meets_prereq = await operator.meets_task_prerequisites()
+            if not meets_prereq:
+                self.logger.info(f"Skipping {operator.task_type.value} Task")
+                continue
+            task_id = await self.initiate_task_in_db(task_type=operator.task_type)
+            run_info: TaskOperatorRunInfo = await operator.run_task(task_id)
+            await self.conclude_task(run_info)
+
+    async def conclude_task(self, run_info):
+        await self.adb_client.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids)
+        await self.handle_outcome(run_info)
+
+    async def initiate_task_in_db(self, task_type: TaskType) -> int:
+        self.logger.info(f"Initiating {task_type.value} Task")
+        task_id = await self.adb_client.initiate_task(task_type=task_type)
+        return task_id
+
+    async def handle_outcome(self, run_info: TaskOperatorRunInfo):
+        match run_info.outcome:
+            case TaskOperatorOutcome.ERROR:
+                await self.handle_task_error(run_info)
+            case TaskOperatorOutcome.SUCCESS:
+                await self.adb_client.update_task_status(
+                    task_id=run_info.task_id,
+                    status=BatchStatus.COMPLETE
+                )
+
+    async def handle_task_error(self, run_info: TaskOperatorRunInfo):
+        await self.adb_client.update_task_status(task_id=run_info.task_id, status=BatchStatus.ERROR)
+        await self.adb_client.add_task_error(task_id=run_info.task_id, error=run_info.message)

     async def convert_to_annotation_request_info(self, url_info: URLAnnotationInfo) -> AnnotationRequestInfo:
         response_html_info = convert_to_response_html_info(
diff --git a/core/DTOs/TaskOperatorRunInfo.py b/core/DTOs/TaskOperatorRunInfo.py
new file mode 100644
index 00000000..6b5c29e0
--- /dev/null
+++ b/core/DTOs/TaskOperatorRunInfo.py
@@ -0,0 +1,14 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel
+
+class TaskOperatorOutcome(Enum):
+    SUCCESS = "success"
+    ERROR = "error"
+
+class TaskOperatorRunInfo(BaseModel):
+    task_id: Optional[int]
+    linked_url_ids: list[int]
+    outcome: TaskOperatorOutcome
+    message: str = ""
\ No newline at end of file
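Taken together with the AsyncCore changes above, the per-operator control flow of run_tasks reduces to roughly the following (a paraphrase of the code above, not a verbatim excerpt; core is an AsyncCore instance):

    operators = await core.get_task_operators()
    for operator in operators:
        if not await operator.meets_task_prerequisites():
            continue  # no task row is created for skipped operators
        task_id = await core.initiate_task_in_db(task_type=operator.task_type)
        run_info = await operator.run_task(task_id)  # exceptions become ERROR outcomes
        await core.conclude_task(run_info)           # links URLs, then COMPLETE or ERROR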
diff --git a/core/classes/AgencyIdentificationTaskOperator.py b/core/classes/AgencyIdentificationTaskOperator.py
index 05fac3cc..de27f6cb 100644
--- a/core/classes/AgencyIdentificationTaskOperator.py
+++ b/core/classes/AgencyIdentificationTaskOperator.py
@@ -88,10 +88,12 @@ async def inner_task_logic(self):
                 )
                 error_infos.append(error_info)

-        await self.adb_client.upsert_new_agencies(all_agency_suggestions)
+        non_unknown_agency_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.UNKNOWN]
+        await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions)
         confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED]
         await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions)
-        await self.adb_client.add_agency_auto_suggestions(all_agency_suggestions)
+        non_confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.CONFIRMED]
+        await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions)
         await self.adb_client.add_url_error_infos(error_infos)
diff --git a/core/classes/TaskOperatorBase.py b/core/classes/TaskOperatorBase.py
index 7998713c..ece3bc81 100644
--- a/core/classes/TaskOperatorBase.py
+++ b/core/classes/TaskOperatorBase.py
@@ -2,6 +2,7 @@

 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.enums import TaskType
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome, TaskOperatorRunInfo
 from core.enums import BatchStatus


@@ -11,6 +12,7 @@ def __init__(self, adb_client: AsyncDatabaseClient):
         self.adb_client = adb_client
         self.task_id = None
         self.tasks_linked = False
+        self.linked_url_ids = []

     @property
     @abstractmethod
@@ -26,8 +28,7 @@ async def meets_task_prerequisites(self):
         raise NotImplementedError

     async def link_urls_to_task(self, url_ids: list[int]):
-        await self.adb_client.link_urls_to_task(task_id=self.task_id, url_ids=url_ids)
-        self.tasks_linked = True
+        self.linked_url_ids = url_ids

     async def initiate_task_in_db(self) -> int:
         task_id = await self.adb_client.initiate_task(
@@ -35,22 +36,32 @@ async def initiate_task_in_db(self) -> int:
         )
         return task_id

-    async def conclude_task_in_db(self):
-        if not self.tasks_linked:
+    async def conclude_task(self):
+        if not self.linked_url_ids:
             raise Exception("Task has not been linked to any URLs")
-        await self.adb_client.update_task_status(task_id=self.task_id, status=BatchStatus.COMPLETE)
-
-    async def run_task(self):
-        if not await self.meets_task_prerequisites():
-            print(f"Task {self.task_type.value} does not meet prerequisites. Skipping...")
-            return
-        self.task_id = await self.initiate_task_in_db()
+        return await self.run_info(
+            outcome=TaskOperatorOutcome.SUCCESS,
+            message="Task completed successfully"
+        )

+    async def run_task(self, task_id: int) -> TaskOperatorRunInfo:
+        self.task_id = task_id
         try:
             await self.inner_task_logic()
-            await self.conclude_task_in_db()
+            return await self.conclude_task()
         except Exception as e:
-            await self.handle_task_error(e)
+            return await self.run_info(
+                outcome=TaskOperatorOutcome.ERROR,
+                message=str(e)
+            )
+
+    async def run_info(self, outcome: TaskOperatorOutcome, message: str):
+        return TaskOperatorRunInfo(
+            task_id=self.task_id,
+            linked_url_ids=self.linked_url_ids,
+            outcome=outcome,
+            message=message
+        )

     @abstractmethod
     async def inner_task_logic(self):
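Under the revised contract, a minimal operator only populates linked_url_ids and lets the base class report the outcome. A sketch (ExampleOperator is illustrative, mirroring the ExampleTaskOperator test helper further below):

    from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
    from collector_db.enums import TaskType
    from core.classes.TaskOperatorBase import TaskOperatorBase

    class ExampleOperator(TaskOperatorBase):
        @property
        def task_type(self) -> TaskType:
            return TaskType.HTML

        async def meets_task_prerequisites(self) -> bool:
            return True

        async def inner_task_logic(self):
            # Record which URLs this run touched; AsyncCore persists the links
            self.linked_url_ids = [1, 2, 3]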
diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py
index c9a6b31a..2d6b603f 100644
--- a/tests/helpers/DBDataCreator.py
+++ b/tests/helpers/DBDataCreator.py
@@ -72,7 +72,10 @@ def urls(self, batch_id: int, url_count: int) -> InsertURLsInfo:
             )
         )

-        return self.db_client.insert_urls(url_infos=url_infos, batch_id=batch_id)
+        return self.db_client.insert_urls(
+            url_infos=url_infos,
+            batch_id=batch_id,
+        )

     def duplicate_urls(self, duplicate_batch_id: int, url_ids: list[int]):
         """
diff --git a/tests/test_alembic/AlembicRunner.py b/tests/test_alembic/AlembicRunner.py
index 867c8ba3..cb435d5a 100644
--- a/tests/test_alembic/AlembicRunner.py
+++ b/tests/test_alembic/AlembicRunner.py
@@ -2,7 +2,7 @@

 from alembic import command
 from alembic.config import Config
-from sqlalchemy import Connection, Inspector, MetaData, inspect
+from sqlalchemy import Connection, Inspector, MetaData, inspect, text
 from sqlalchemy.orm import scoped_session


@@ -40,3 +40,10 @@ def table_exists(self, table_name: str) -> bool:

     def tables_exist(self, table_names: list[str]) -> bool:
         return all(table_name in self.inspector.get_table_names() for table_name in table_names)
+
+    def execute(self, sql: str):
+        result = self.connection.execute(text(sql))
+        if result.cursor is not None:
+            results = result.fetchall()
+        self.connection.commit()
+        return results
diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py
index aa3deb87..343890df 100644
--- a/tests/test_alembic/test_revisions.py
+++ b/tests/test_alembic/test_revisions.py
@@ -164,39 +164,38 @@ def test_convert_batch_strategy_status_to_enum(alembic_runner):
         "aborted"
     ]
     d = {}
-    with alembic_runner.session() as session:
-        for strategy, status in product(existing_strategy_strings, existing_status_strings):
-            # Execute inserts and store each ID
-            id_ = session.execute(text(
-                f"""
-                INSERT INTO BATCHES
-                (strategy, user_id, status, total_url_count, original_url_count, duplicate_url_count)
-                VALUES(
-                    '{strategy}',
-                    1,
-                    '{status}',
-                    0,
-                    0,
-                    0
-                )
-                RETURNING ID;
-                """
-            )).scalar()
-            d[id_] = [strategy, status]
-        session.commit()
+    for strategy, status in product(existing_strategy_strings, existing_status_strings):
+        # Execute inserts and store each ID
+        query = f"""
+            INSERT INTO BATCHES
+            (strategy, user_id, status, total_url_count, original_url_count, duplicate_url_count)
+            VALUES(
+                '{strategy}',
+                1,
+                '{status}',
+                0,
+                0,
+                0
+            )
+            RETURNING ID;
+        """
+
+        id_ = alembic_runner.execute(query)[0][0]
+        d[id_] = [strategy, status]

     alembic_runner.upgrade('db6d60feda7d')
-    with alembic_runner.session() as session:
-        # Assert all strategies and statuses remain the same
-        for id_ in d.keys():
-            strategy, status = d[id_]
-            result = session.execute(text(
-                f"""
-                SELECT strategy, status FROM BATCHES WHERE id = {id_};
-                """
-            )).fetchone()
-            assert result[0] == strategy
-            assert result[1] == status
+
+    # Assert all strategies and statuses remain the same
+    for id_ in d.keys():
+        strategy, status = d[id_]
+
+        result = alembic_runner.execute(
+            f"""
+            SELECT strategy, status FROM BATCHES WHERE id = {id_};
+            """
+        )[0]
+        assert result[0] == strategy
+        assert result[1] == status


 def test_convert_url_outcome_to_enum(alembic_runner):
@@ -209,50 +208,49 @@ def test_convert_url_outcome_to_enum(alembic_runner):
         'duplicate',
     ]
     d = {}
-    with alembic_runner.session() as session:
-        batch_id = session.execute(text(
-            """INSERT INTO BATCHES
-            (strategy, user_id, status, total_url_count, original_url_count, duplicate_url_count)
-            VALUES(
-                'ckan',
-                1,
-                'in-process',
-                0,
-                0,
-                0
-            )
-            RETURNING ID;
-            """
-        )).scalar()
-
-        for outcome in existing_outcome_strings:
-            id_ = session.execute(text(
-                f"""
-                INSERT INTO URLS
-                (batch_id, url, collector_metadata, outcome)
-                VALUES (
-                    '{batch_id}',
-                    'https://example.com/{outcome}',
-                    '{{}}',
-                    '{outcome}'
-                )
-                RETURNING ID;
-                """
-            )).scalar()
-            d[id_] = outcome
-        session.commit()
+    # with alembic_runner.session() as session:
+    batch_id = alembic_runner.execute(
+        """INSERT INTO BATCHES
+        (strategy, user_id, status, total_url_count, original_url_count, duplicate_url_count)
+        VALUES(
+            'ckan',
+            1,
+            'in-process',
+            0,
+            0,
+            0
+        )
+        RETURNING ID;
+        """
+    )[0][0]
+
+
+    for outcome in existing_outcome_strings:
+        id_ = alembic_runner.execute(
+            f"""
+            INSERT INTO URLS
+            (batch_id, url, collector_metadata, outcome)
+            VALUES (
+                '{batch_id}',
+                'https://example.com/{outcome}',
+                '{{}}',
+                '{outcome}'
+            )
+            RETURNING ID;
+            """
+        )[0][0]
+        d[id_] = outcome

     alembic_runner.upgrade('e27c5f8409a3')

-    with alembic_runner.session() as session:
-        for id_ in d.keys():
-            outcome = d[id_]
+    for id_ in d.keys():
+        outcome = d[id_]

-            result = session.execute(text(
-                f"""SELECT OUTCOME FROM URLS WHERE ID = {id_};"""
-            )).scalar()
+        result = alembic_runner.execute(
+            f"""SELECT OUTCOME FROM URLS WHERE ID = {id_};"""
+        )[0][0]

-            assert result == outcome
+        assert result == outcome

 def test_create_htmlcontent_and_rooturl_tables(alembic_runner):
     alembic_runner.upgrade('e27c5f8409a3')
diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py
index 30cf6f27..cdf93801 100644
--- a/tests/test_automated/integration/collector_db/test_database_structure.py
+++ b/tests/test_automated/integration/collector_db/test_database_structure.py
@@ -328,36 +328,6 @@ def test_root_url(db_data_creator: DBDataCreator):
     table_tester.run_column_tests()


-@pytest.mark.asyncio
-async def test_url_agency_suggestions_trigger(db_data_creator: DBDataCreator):
-    # Check that if an entry is added to the user_url_agency_suggestions table,
-    # The trigger checks that the corresponding entry `url_agency_suggestions` has value 'Manual Suggestion'
-    # And raises an error if not
-    dbdc = db_data_creator
-    creation_info: BatchURLCreationInfo = await dbdc.batch_and_urls(
-        with_html_content=True
-    )
-    # Insert agency suggestion
-    suggestion_info = URLAgencySuggestionInfo(
-        url_id=creation_info.url_ids[0],
-        suggestion_type=SuggestionType.MANUAL_SUGGESTION,
-        pdap_agency_id=1,
-        agency_name="Test Agency",
-        state="Test State",
-        county="Test County",
-        locality="Test Locality",
-        user_id=None
-    )
-
-    adb_client = dbdc.adb_client
-
-    # Without the User ID, should fail
-    with pytest.raises(DBAPIError):
-        await adb_client.add_agency_suggestions([suggestion_info])
-
-    # With the User ID, should succeed
-    suggestion_info.user_id = 1
-    await adb_client.add_agency_suggestions([suggestion_info])


 @pytest.mark.asyncio
 async def test_upsert_new_agencies(db_data_creator: DBDataCreator):
     """
     Check that if the agency doesn't exist, it is added
diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py
new file mode 100644
index 00000000..3b99d15c
--- /dev/null
+++ b/tests/test_automated/integration/core/test_async_core.py
@@ -0,0 +1,118 @@
+import types
+from unittest.mock import MagicMock, AsyncMock
+
+import pytest
+
+from collector_db.enums import TaskType
+from collector_db.models import Task
+from core.AsyncCore import AsyncCore
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome
+from core.enums import BatchStatus
+from helpers.DBDataCreator import DBDataCreator
+
+
+@pytest.mark.asyncio
+async def test_conclude_task_success(db_data_creator: DBDataCreator):
+    ddc = db_data_creator
+
+    batch_id = ddc.batch()
+    url_ids = ddc.urls(batch_id=batch_id, url_count=3).url_ids
+    task_id = await ddc.task()
+    run_info = TaskOperatorRunInfo(
+        task_id=task_id,
+        linked_url_ids=url_ids,
+        outcome=TaskOperatorOutcome.SUCCESS,
+    )
+
+    core = AsyncCore(
+        adb_client=ddc.adb_client,
+        huggingface_interface=MagicMock(),
+        url_request_interface=MagicMock(),
+        html_parser=MagicMock()
+    )
+    await core.conclude_task(run_info=run_info)
+
+    task_info = await ddc.adb_client.get_task_info(task_id=task_id)
+
+    assert task_info.task_status == BatchStatus.COMPLETE
+    assert len(task_info.urls) == 3
+
+@pytest.mark.asyncio
+async def test_conclude_task_error(db_data_creator: DBDataCreator):
+    ddc = db_data_creator
+
+    batch_id = ddc.batch()
+    url_ids = ddc.urls(batch_id=batch_id, url_count=3).url_ids
+    task_id = await ddc.task()
+    run_info = TaskOperatorRunInfo(
+        task_id=task_id,
+        linked_url_ids=url_ids,
+        outcome=TaskOperatorOutcome.ERROR,
+        message="test error",
+    )
+
+    core = AsyncCore(
+        adb_client=ddc.adb_client,
+        huggingface_interface=MagicMock(),
+        url_request_interface=MagicMock(),
+        html_parser=MagicMock()
+    )
+    await core.conclude_task(run_info=run_info)
+
+    task_info = await ddc.adb_client.get_task_info(task_id=task_id)
+
+    assert task_info.task_status == BatchStatus.ERROR
+    assert task_info.error_info == "test error"
+    assert len(task_info.urls) == 3
+
+@pytest.mark.asyncio
+async def test_run_task_prereq_not_met():
+    core = AsyncCore(
+        adb_client=AsyncMock(),
+        huggingface_interface=AsyncMock(),
+        url_request_interface=AsyncMock(),
+        html_parser=AsyncMock()
+    )
+
+    mock_operator = AsyncMock()
+    mock_operator.meets_task_prerequisites = AsyncMock(return_value=False)
+    AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator])
+    await core.run_tasks()
+
+    mock_operator.meets_task_prerequisites.assert_called_once()
+    mock_operator.run_task.assert_not_called()
+
+@pytest.mark.asyncio
+async def test_run_task_prereq_met(db_data_creator: DBDataCreator):
+
+    async def run_task(self, task_id: int) -> TaskOperatorRunInfo:
+        return TaskOperatorRunInfo(
+            task_id=task_id,
+            outcome=TaskOperatorOutcome.SUCCESS,
+            linked_url_ids=[1, 2, 3]
+        )
+
+    core = AsyncCore(
+        adb_client=db_data_creator.adb_client,
+        huggingface_interface=AsyncMock(),
+        url_request_interface=AsyncMock(),
+        html_parser=AsyncMock()
+    )
+    core.conclude_task = AsyncMock()
+
+    mock_operator = AsyncMock()
+    mock_operator.meets_task_prerequisites = AsyncMock(return_value=True)
+    mock_operator.task_type = TaskType.HTML
+    mock_operator.run_task = types.MethodType(run_task, mock_operator)
+
+    AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator])
+    await core.run_tasks()
+
+    mock_operator.meets_task_prerequisites.assert_called_once()
+
+    results = await db_data_creator.adb_client.get_all(Task)
+
+    assert len(results) == 1
+    assert results[0].task_status == BatchStatus.IN_PROCESS.value
+
+    core.conclude_task.assert_called_once()
diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
index 2ee9ec45..c8df809c 100644
--- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
+++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
@@ -1,4 +1,5 @@
 import types
+from copy import deepcopy
 from typing import Optional
 from unittest.mock import MagicMock, AsyncMock, patch
@@ -8,6 +9,7 @@
 from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse
 from collector_db.models import ConfirmedUrlAgency, Agency, AutomatedUrlAgencySuggestion
 from collector_manager.enums import CollectorType
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator
@@ -23,6 +25,7 @@

 sample_agency_suggestions = [
     URLAgencySuggestionInfo(
+        url_id=-1, # This will be overwritten
         suggestion_type=SuggestionType.UNKNOWN,
         pdap_agency_id=None,
         agency_name=None,
@@ -31,16 +34,18 @@
         locality=None
     ),
     URLAgencySuggestionInfo(
+        url_id=-1, # This will be overwritten
         suggestion_type=SuggestionType.CONFIRMED,
-        pdap_agency_id=1,
+        pdap_agency_id=-1,
         agency_name="Test Agency",
         state="Test State",
         county="Test County",
         locality="Test Locality"
     ),
     URLAgencySuggestionInfo(
+        url_id=-1, # This will be overwritten
         suggestion_type=SuggestionType.AUTO_SUGGESTION,
-        pdap_agency_id=2,
+        pdap_agency_id=-1,
         agency_name="Test Agency 2",
         state="Test State 2",
         county="Test County 2",
@@ -55,9 +60,10 @@ async def mock_run_subtask(
             url_id: int,
             collector_metadata: Optional[dict]
     ):
-        val = url_id % 3
-        suggestion = sample_agency_suggestions[val]
+        # Deepcopy to prevent using the same instance in memory
+        suggestion = deepcopy(sample_agency_suggestions[url_id % 3])
         suggestion.url_id = url_id
+        suggestion.pdap_agency_id = (url_id % 3) if suggestion.suggestion_type != SuggestionType.UNKNOWN else None
         return [suggestion]

     async with ClientSession() as session:
@@ -83,9 +89,9 @@ async def mock_run_subtask(
             muckrock_api_interface=muckrock_api_interface
         )

-        # Try to run initially and confirm it doesn't run
-        # due to not meeting prerequisites
-        await operator.run_task()
+        # Confirm does not yet meet prerequisites
+        assert not await operator.meets_task_prerequisites()
+

         d = {}

@@ -101,8 +107,12 @@ async def mock_run_subtask(
             creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls(strategy=strategy, url_count=1, with_html_content=True)
             d[strategy] = creation_info.url_ids[0]
+
+        # Confirm meets prerequisites
+        assert await operator.meets_task_prerequisites()

         # Run task
-        await operator.run_task()
+        run_info = await operator.run_task(1)
+        assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message

         # Confirm tasks are piped into the correct subtasks
         # * common_crawler into common_crawler_subtask
@@ -136,10 +146,11 @@ async def mock_run_subtask(
             url_id = d[collector_type]
             assert d2[url_id] == subtask_class

-        # Run task again and confirm it doesn't call any additional subtasks
-        await operator.run_task()
-
         assert mock.call_count == 6

+        # Confirm task again does not meet prerequisites
+        assert not await operator.meets_task_prerequisites()
+
+

         # Check confirmed and auto suggestions
diff --git a/tests/test_automated/integration/tasks/test_example_task.py b/tests/test_automated/integration/tasks/test_example_task.py
index f6f56521..819d0dc0 100644
--- a/tests/test_automated/integration/tasks/test_example_task.py
+++ b/tests/test_automated/integration/tasks/test_example_task.py
@@ -3,6 +3,7 @@
 import pytest

 from collector_db.enums import TaskType
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
 from core.classes.TaskOperatorBase import TaskOperatorBase
 from core.enums import BatchStatus
 from tests.helpers.DBDataCreator import DBDataCreator
@@ -31,32 +32,15 @@ async def test_example_task_success(db_data_creator: DBDataCreator):

     async def mock_inner_task_logic(self):
         # Add link to 3 urls
-        await self.adb_client.link_urls_to_task(task_id=self.task_id, url_ids=url_ids)
-        self.tasks_linked = True
+        self.linked_url_ids = url_ids

     operator = ExampleTaskOperator(adb_client=db_data_creator.adb_client)
     operator.inner_task_logic = types.MethodType(mock_inner_task_logic, operator)
-    await operator.run_task()
+    run_info = await operator.run_task(1)
+    assert run_info.outcome == TaskOperatorOutcome.SUCCESS
+    assert run_info.linked_url_ids == url_ids

-    # Get Task Info
-    task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id)
-
-    # Check that 3 urls were linked to the task
-    assert len(task_info.urls) == 3
-
-    # Check that error info is empty
-    assert task_info.error_info is None
-
-    # Check that the task was marked as complete
-    assert task_info.task_status == BatchStatus.COMPLETE
-
-    # Check that the task type is HTML
-    assert task_info.task_type == TaskType.HTML
-
-
-    # Check that updated_at is not null
-    assert task_info.updated_at is not None

 @pytest.mark.asyncio
 async def test_example_task_failure(db_data_creator: DBDataCreator):
@@ -66,22 +50,8 @@ def mock_inner_task_logic(self):
         raise ValueError("test error")

     operator.inner_task_logic = types.MethodType(mock_inner_task_logic, operator)
-    await operator.run_task()
-
-    # Get Task Info
-    task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id)
-
-    # Check that there are no URLs associated
-    assert len(task_info.urls) == 0
-
-    # Check that the task was marked as errored
-    assert task_info.task_status == BatchStatus.ERROR
-
-    # Check that the task type is HTML
-    assert task_info.task_type == TaskType.HTML
-
-    # Check error
-    assert "test error" in task_info.error_info
+    run_info = await operator.run_task(1)
+    assert run_info.outcome == TaskOperatorOutcome.ERROR
diff --git a/tests/test_automated/integration/tasks/test_url_html_task.py b/tests/test_automated/integration/tasks/test_url_html_task.py
index 7674113f..75c46855 100644
--- a/tests/test_automated/integration/tasks/test_url_html_task.py
+++ b/tests/test_automated/integration/tasks/test_url_html_task.py
@@ -5,6 +5,7 @@

 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.enums import TaskType
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
 from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator
 from core.enums import BatchStatus
 from tests.helpers.DBDataCreator import DBDataCreator
@@ -65,20 +66,22 @@ async def mock_get_from_cache(self, url: str) -> Optional[str]:
         url_request_interface=url_request_interface,
         html_parser=html_parser
     )
-    await operator.run_task()

-    # Check that, because no URLs were created, the task did not run
-    await assert_database_has_no_tasks(db_data_creator.adb_client)
+    meets_prereqs = await operator.meets_task_prerequisites()
+    # Check that, because no URLs were created, the prereqs are not met
+    assert not meets_prereqs

     batch_id = db_data_creator.batch()
     url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings
     url_ids = [url_info.url_id for url_info in url_mappings]

-    await operator.run_task()
+    task_id = await db_data_creator.adb_client.initiate_task(task_type=TaskType.HTML)
+    run_info = await operator.run_task(task_id)
+    assert run_info.outcome == TaskOperatorOutcome.SUCCESS
+    assert run_info.linked_url_ids == url_ids

     # Check in database that
-    # - task is listed as complete
     # - task type is listed as 'HTML'
     # - task has 3 urls
     # - task has one errored url with error "ValueError"
@@ -87,18 +90,17 @@ async def mock_get_from_cache(self, url: str) -> Optional[str]:
     )

     assert task_info.error_info is None
-    assert task_info.task_status == BatchStatus.COMPLETE
     assert task_info.task_type == TaskType.HTML
-    assert len(task_info.urls) == 3
     assert len(task_info.url_errors) == 1
     assert task_info.url_errors[0].error == "test error"

     adb = db_data_creator.adb_client

     # Check that both success urls have two rows of HTML data
-    hci = await adb.get_html_content_info(url_id=task_info.urls[0].id)
+    await adb.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids)
+    hci = await adb.get_html_content_info(url_id=url_ids[0])
     assert len(hci) == 2
-    hci = await adb.get_html_content_info(url_id=task_info.urls[1].id)
+    hci = await adb.get_html_content_info(url_id=url_ids[1])
     assert len(hci) == 2

     # Check that errored url has error info
diff --git a/tests/test_automated/integration/tasks/test_url_record_type_task.py b/tests/test_automated/integration/tasks/test_url_record_type_task.py
index ee624dae..cf4c8e0e 100644
--- a/tests/test_automated/integration/tasks/test_url_record_type_task.py
+++ b/tests/test_automated/integration/tasks/test_url_record_type_task.py
@@ -4,6 +4,7 @@

 from collector_db.enums import TaskType
 from collector_db.models import URLMetadata
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
 from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator
 from core.enums import RecordType, BatchStatus
 from tests.helpers.DBDataCreator import DBDataCreator
@@ -21,29 +22,32 @@ async def test_url_record_type_task(db_data_creator: DBDataCreator):
         adb_client=db_data_creator.adb_client,
         classifier=mock_classifier
     )
-    await operator.run_task()

-    # No task should have been created due to not meeting prerequisites
-    await assert_database_has_no_tasks(db_data_creator.adb_client)
+    # Should not meet prerequisites
+    meets_prereqs = await operator.meets_task_prerequisites()
+    assert not meets_prereqs

     batch_id = db_data_creator.batch()
     iui = db_data_creator.urls(batch_id=batch_id, url_count=2)
     url_ids = [iui.url_mappings[0].url_id, iui.url_mappings[1].url_id]
     await db_data_creator.html_data(url_ids)

-    await operator.run_task()
+    assert await operator.meets_task_prerequisites()
+    task_id = await db_data_creator.adb_client.initiate_task(task_type=TaskType.RECORD_TYPE)
+
+    run_info = await operator.run_task(task_id)
+    assert run_info.outcome == TaskOperatorOutcome.SUCCESS

     # Task should have been created
     task_info = await db_data_creator.adb_client.get_task_info(task_id=operator.task_id)
     assert task_info.error_info is None
-    assert task_info.task_status == BatchStatus.COMPLETE

     response = await db_data_creator.adb_client.get_tasks()
     tasks = response.tasks
     assert len(tasks) == 1
     task = tasks[0]
     assert task.type == TaskType.RECORD_TYPE
-    assert task.url_count == 2
+    assert run_info.linked_url_ids == url_ids
     assert task.url_error_count == 1

     # Get metadata
diff --git a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py
index abf86cda..188621b7 100644
--- a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py
+++ b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py
@@ -39,7 +39,7 @@ def mock_get_url_relevancy(
         adb_client=AsyncDatabaseClient(),
         huggingface_interface=mock_hf_interface
     )
-    await task_operator.run_task()
+    await task_operator.run_task(1)

     await assert_database_has_no_tasks(db_data_creator.adb_client)

@@ -49,7 +49,7 @@ def mock_get_url_relevancy(
     await db_data_creator.html_data(url_ids)
     await db_data_creator.metadata([url_ids[0]])

-    await task_operator.run_task()
+    await task_operator.run_task(1)

     results = await db_data_creator.adb_client.get_all(URLMetadata)
From 28df49dadf9154c39ff7abebf10f8eb924b40e15 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 10 Feb 2025 09:34:20 -0500
Subject: [PATCH 048/182] Revise agency identification annotation logic

---
 Dockerfile                                    |   3 +-
 agency_identifier/MuckrockAPIInterface.py     |   2 +-
 alembic/env.py                                |  12 +-
 ...daf0_revise_agency_identification_logic.py |  23 +-
 api/routes/annotate.py                        |   2 +
 collector_db/AsyncDatabaseClient.py           | 127 +++++++++-
 core/AsyncCore.py                             |  18 +-
 .../GetNextURLForAgencyAnnotationResponse.py  |  15 +-
 .../AgencyIdentificationTaskOperator.py       |  59 ++---
 pdap_api_client/AccessManager.py              |   3 +-
 tests/helpers/DBDataCreator.py                |  70 +++++-
 .../integration/api/conftest.py               |   3 +
 .../api/helpers/RequestValidator.py           |  19 ++
 .../integration/api/test_annotate.py          | 225 +++++++++++++++++-
 .../collector_db/test_database_structure.py   |   3 +-
 15 files changed, 532 insertions(+), 52 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6cf1d6a2..c93fe158 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,6 +28,7 @@
 COPY alembic.ini ./alembic.ini
 COPY alembic ./alembic
 COPY apply_migrations.py ./apply_migrations.py
 COPY security_manager ./security_manager
+COPY pdap_api_client ./pdap_api_client
 COPY execute.sh ./execute.sh
 COPY .project-root ./.project-root

@@ -45,4 +46,4 @@ EXPOSE 80
 RUN chmod +x execute.sh
 # Use the below for ease of local development, but remove when pushing to GitHub
 # Because there is no .env file in the repository (for security reasons)
-#COPY .env ./.env
+COPY .env ./.env
diff --git a/agency_identifier/MuckrockAPIInterface.py b/agency_identifier/MuckrockAPIInterface.py
index bbc56ee7..703164fc 100644
--- a/agency_identifier/MuckrockAPIInterface.py
+++ b/agency_identifier/MuckrockAPIInterface.py
@@ -20,7 +20,7 @@ class AgencyLookupResponse(BaseModel):


 class MuckrockAPIInterface:

-    def __init__(self, session: ClientSession):
+    def __init__(self, session: Optional[ClientSession] = None):
         self.base_url = "https://www.muckrock.com/api_v1/"
         self.session = session
diff --git a/alembic/env.py b/alembic/env.py
index 69587988..7eaa1a8b 100644
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from logging.config import fileConfig

 from alembic import context
@@ -59,6 +60,13 @@ def run_migrations_online() -> None:
     and associate a connection with the context.

     """
+
+    def process_revision_directives(context, revision, directives):
+        # 20210801211024 for a migration generated on Aug 1st, 2021 at 21:10:24
+        rev_id = datetime.now().strftime("%Y%m%d%H%M%S")
+        for directive in directives:
+            directive.rev_id = rev_id
+
     connectable = engine_from_config(
         config.get_section(config.config_ini_section, {}),
         prefix="sqlalchemy.",
@@ -67,7 +75,9 @@ def run_migrations_online() -> None:

     with connectable.connect() as connection:
         context.configure(
-            connection=connection, target_metadata=target_metadata
+            connection=connection,
+            target_metadata=target_metadata,
+            process_revision_directives=process_revision_directives
         )

         with context.begin_transaction():
diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py
index 4db28b9d..62d9930d 100644
--- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py
+++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py
@@ -77,7 +77,6 @@ def upgrade():
         FOR EACH ROW
         EXECUTE FUNCTION enforce_no_agency_id_if_unknown();
     """)
-
     # Create user_url_agency_suggestions table
     op.create_table(
         "user_url_agency_suggestions",
@@ -90,6 +89,26 @@ def upgrade():
     op.create_unique_constraint(
         "uq_user_url_agency_suggestions", "user_url_agency_suggestions", ["agency_id", "url_id", "user_id"]
     )
+    op.execute("""
+        CREATE OR REPLACE FUNCTION enforce_no_agency_id_if_new()
+        RETURNS TRIGGER AS $$
+        BEGIN
+            IF NEW.is_new = TRUE AND NEW.agency_id IS NOT NULL THEN
+                RAISE EXCEPTION 'agency_id must be null when is_new is TRUE';
+            END IF;
+            RETURN NEW;
+        END;
+        $$ LANGUAGE plpgsql;
+    """)
+    op.execute("""
+        CREATE TRIGGER enforce_no_agency_id_if_new
+        BEFORE INSERT ON user_url_agency_suggestions
+        FOR EACH ROW
+        EXECUTE FUNCTION enforce_no_agency_id_if_new();
+    """)
+
+

     op.drop_table('url_agency_suggestions')
     suggestion_type_enum.drop(op.get_bind(), checkfirst=True)
@@ -127,4 +146,6 @@ def downgrade():
     op.execute("""
         DROP FUNCTION IF EXISTS enforce_no_agency_id_if_unknown;
     """)
+    op.execute("DROP TRIGGER enforce_no_agency_id_if_new ON user_url_agency_suggestions")
+    op.execute("DROP FUNCTION enforce_no_agency_id_if_new()")
diff --git a/api/routes/annotate.py b/api/routes/annotate.py
index 980c16f9..591920ff 100644
--- a/api/routes/annotate.py
+++ b/api/routes/annotate.py
@@ -76,6 +76,7 @@ async def annotate_url_for_record_type_and_get_next_url(
     )
     return result

+@annotate_router.get("/agency")
 async def get_next_url_for_agency_annotation(
     access_info: AccessInfo = Depends(get_access_info),
     async_core: AsyncCore = Depends(get_async_core),
@@ -85,6 +86,7 @@ async def get_next_url_for_agency_annotation(
     )
     return result

+@annotate_router.post("/agency/{url_id}")
 async def annotate_url_for_agency_and_get_next_url(
     url_id: int,
     agency_annotation_post_info: URLAgencyAnnotationPostInfo,
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index d89ffc10..2f657c54 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -18,8 +18,11 @@
 from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType
 from collector_db.helper_functions import get_postgres_connection_string
 from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \
-    RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion
+    RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, \
+    UserUrlAgencySuggestion
 from collector_manager.enums import URLStatus, CollectorType
+from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \
+    GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo
 from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \
     GetURLsResponseInnerInfo
@@ -27,6 +30,7 @@
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
 from core.enums import BatchStatus, SuggestionType
+from html_tag_collector.DataClassTags import convert_to_response_html_info


 def add_standard_limit_and_offset(statement, page, limit=100):
@@ -610,6 +614,106 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li
             for raw_result in raw_results
         ]

+    @session_manager
+    async def get_next_url_agency_for_annotation(
+            self, session: AsyncSession, user_id: int
+    ) -> GetNextURLForAgencyAnnotationResponse:
+        """
+        Retrieve URL for annotation
+        The URL must
+            not be a confirmed URL
+            not have been annotated by this user
+            have extant autosuggestions
+        """
+        # Select statement
+        statement = (
+            select(URL.id, URL.url)
+            # Must not be a confirmed URL
+            .join(ConfirmedUrlAgency, isouter=True)
+            .where(
+                ~exists(
+                    select(ConfirmedUrlAgency).
+                    where(ConfirmedUrlAgency.url_id == URL.id).
+                    correlate(URL)
+                )
+            )
+            # Must not have been annotated by this user
+            .join(UserUrlAgencySuggestion, isouter=True)
+            .where(
+                ~exists(
+                    select(UserUrlAgencySuggestion).
+                    where(
+                        (UserUrlAgencySuggestion.user_id == user_id) &
+                        (UserUrlAgencySuggestion.url_id == URL.id)
+                    ).
+                    correlate(URL)
+                )
+            )
+            # Must have extant autosuggestions
+            .join(AutomatedUrlAgencySuggestion, isouter=True)
+            .where(
+                exists(
+                    select(AutomatedUrlAgencySuggestion).
+                    where(AutomatedUrlAgencySuggestion.url_id == URL.id).
+                    correlate(URL)
+                )
+            )
+        ).limit(1)
+        raw_result = await session.execute(statement)
+        results = raw_result.all()
+        if len(results) == 0:
+            return GetNextURLForAgencyAnnotationResponse(
+                next_annotation=None
+            )
+
+        result = results[0]
+        url_id = result[0]
+        url = result[1]
+        # Get relevant autosuggestions and agency info, if an associated agency exists
+        statement = (
+            select(
+                AutomatedUrlAgencySuggestion.agency_id,
+                AutomatedUrlAgencySuggestion.is_unknown,
+                Agency.name,
+                Agency.state,
+                Agency.county,
+                Agency.locality
+            )
+            .join(Agency, isouter=True)
+            .where(AutomatedUrlAgencySuggestion.url_id == url_id)
+        )
+        raw_autosuggestions = await session.execute(statement)
+        autosuggestions = raw_autosuggestions.all()
+        agency_suggestions = []
+        for autosuggestion in autosuggestions:
+            agency_id = autosuggestion[0]
+            is_unknown = autosuggestion[1]
+            name = autosuggestion[2]
+            state = autosuggestion[3]
+            county = autosuggestion[4]
+            locality = autosuggestion[5]
+            agency_suggestions.append(GetNextURLForAgencyAgencyInfo(
+                suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN,
+                pdap_agency_id=agency_id,
+                agency_name=name,
+                state=state,
+                county=county,
+                locality=locality
+            ))
+
+        # Get HTML content info
+        html_content_infos = await self.get_html_content_info(url_id)
+        response_html_info = convert_to_response_html_info(html_content_infos)
+
+        return GetNextURLForAgencyAnnotationResponse(
+            next_annotation=GetNextURLForAgencyAnnotationInnerResponse(
+                url_id=url_id,
+                url=url,
+                html_info=response_html_info,
+                agency_suggestions=agency_suggestions
+            )
+        )
+
     @session_manager
     async def upsert_new_agencies(
             self,
@@ -656,4 +760,23 @@ async def add_agency_auto_suggestions(
             )
             session.add(url_agency_suggestion)

-        await session.commit()
\ No newline at end of file
+        await session.commit()
+
+    @session_manager
+    async def add_agency_manual_suggestion(
+            self,
+            session: AsyncSession,
+            agency_id: Optional[int],
+            url_id: int,
+            user_id: int,
+            is_new: bool
+    ):
+        if is_new and agency_id is not None:
+            raise ValueError("agency_id must be None when is_new is True")
+        url_agency_suggestion = UserUrlAgencySuggestion(
+            url_id=url_id,
+            agency_id=agency_id,
+            user_id=user_id,
+            is_new=is_new
+        )
+        session.add(url_agency_suggestion)
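The candidate-selection statement above reduces to roughly this SQL shape (hand-written for readability, not ORM output; the ORM version also carries outer joins that the EXISTS clauses make redundant):

    # Rough SQL equivalent of the annotation-queue query
    CANDIDATE_URL_SQL = """
    SELECT urls.id, urls.url
    FROM urls
    WHERE NOT EXISTS (SELECT 1 FROM confirmed_url_agency c WHERE c.url_id = urls.id)
      AND NOT EXISTS (SELECT 1 FROM user_url_agency_suggestions u
                      WHERE u.url_id = urls.id AND u.user_id = :user_id)
      AND EXISTS (SELECT 1 FROM automated_url_agency_suggestions a WHERE a.url_id = urls.id)
    LIMIT 1;
    """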
diff --git a/core/AsyncCore.py b/core/AsyncCore.py
index 2ec17da5..808821f7 100644
--- a/core/AsyncCore.py
+++ b/core/AsyncCore.py
@@ -44,6 +44,7 @@ def __init__(
         self.url_request_interface = url_request_interface
         self.html_parser = html_parser
         self.logger = logging.getLogger(__name__)
+        self.logger.addHandler(logging.StreamHandler())
         self.logger.setLevel(logging.INFO)

     async def get_url_html_task_operator(self):
@@ -71,16 +72,14 @@ async def get_url_record_type_task_operator(self):
         return operator

     async def get_agency_identification_task_operator(self):
-        session = ClientSession()
         pdap_client = PDAPClient(
             access_manager=AccessManager(
                 email=get_from_env("PDAP_EMAIL"),
                 password=get_from_env("PDAP_PASSWORD"),
                 api_key=get_from_env("PDAP_API_KEY"),
-                session=session
             ),
         )
-        muckrock_api_interface = MuckrockAPIInterface(session=session)
+        muckrock_api_interface = MuckrockAPIInterface()
         operator = AgencyIdentificationTaskOperator(
             adb_client=self.adb_client,
             pdap_client=pdap_client,
@@ -208,15 +207,16 @@ async def submit_url_agency_annotation(
             url_id: int,
             agency_post_info: URLAgencyAnnotationPostInfo
     ) -> GetNextURLForAgencyAnnotationResponse:
-        if agency_post_info.suggested_agency == "NEW":
-            suggestion_type = SuggestionType.NEW_AGENCY
+        if not agency_post_info.is_new and not agency_post_info.suggested_agency:
+            raise ValueError("suggested_agency must be provided if is_new is False")
+
+        if agency_post_info.is_new:
             agency_suggestion_id = None
         else:
-            suggestion_type = SuggestionType.MANUAL_SUGGESTION
             agency_suggestion_id = agency_post_info.suggested_agency
-        return await self.adb_client.submit_url_agency_annotation(
+        return await self.adb_client.add_agency_manual_suggestion(
             user_id=user_id,
             url_id=url_id,
-            suggestion_type=suggestion_type,
-            agency_suggestion_id=agency_suggestion_id
+            agency_id=agency_suggestion_id,
+            is_new=agency_post_info.is_new,
         )
diff --git a/core/DTOs/GetNextURLForAgencyAnnotationResponse.py b/core/DTOs/GetNextURLForAgencyAnnotationResponse.py
index 4710275e..8b3d06f4 100644
--- a/core/DTOs/GetNextURLForAgencyAnnotationResponse.py
+++ b/core/DTOs/GetNextURLForAgencyAnnotationResponse.py
@@ -1,9 +1,11 @@
 from typing import Optional, Literal

+from pydantic import BaseModel
+
 from core.enums import SuggestionType
 from html_tag_collector.DataClassTags import ResponseHTMLInfo

-class GetNextURLForAgencyAgencyInfo:
+class GetNextURLForAgencyAgencyInfo(BaseModel):
     suggestion_type: SuggestionType
     pdap_agency_id: Optional[int] = None
     agency_name: Optional[str] = None
@@ -11,12 +13,17 @@
     county: Optional[str] = None
     locality: Optional[str] = None

-class GetNextURLForAgencyAnnotationResponse:
+class GetNextURLForAgencyAnnotationInnerResponse(BaseModel):
     url_id: int
+    url: str
     agency_suggestions: list[
         GetNextURLForAgencyAgencyInfo
     ]
     html_info: ResponseHTMLInfo

-class URLAgencyAnnotationPostInfo:
-    suggested_agency: int | Literal["NEW"]
\ No newline at end of file
+class GetNextURLForAgencyAnnotationResponse(BaseModel):
+    next_annotation: Optional[GetNextURLForAgencyAnnotationInnerResponse]
+
+class URLAgencyAnnotationPostInfo(BaseModel):
+    is_new: bool = False
+    suggested_agency: Optional[int] = None
\ No newline at end of file
diff --git a/core/classes/AgencyIdentificationTaskOperator.py b/core/classes/AgencyIdentificationTaskOperator.py
index de27f6cb..1589b96f 100644
--- a/core/classes/AgencyIdentificationTaskOperator.py
+++ b/core/classes/AgencyIdentificationTaskOperator.py
@@ -1,3 +1,5 @@
+from aiohttp import ClientSession
+
 from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
@@ -67,33 +69,36 @@ async def run_subtask(subtask, url_id, collector_metadata) -> list[URLAgencySuggestionInfo]:
         return await subtask.run(url_id=url_id, collector_metadata=collector_metadata)

     async def inner_task_logic(self):
-        tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification()
-        await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos])
-        error_infos = []
-        all_agency_suggestions = []
-        for tdo in tdos:
-            subtask = await self.get_subtask(tdo.collector_type)
-            try:
-                new_agency_suggestions = await self.run_subtask(
-                    subtask,
-                    tdo.url_id,
-                    tdo.collector_metadata
-                )
-                all_agency_suggestions.extend(new_agency_suggestions)
-            except Exception as e:
-                error_info = URLErrorPydanticInfo(
-                    task_id=self.task_id,
-                    url_id=tdo.url_id,
-                    error=str(e),
-                )
-                error_infos.append(error_info)
+        async with ClientSession() as session:
+            self.pdap_client.access_manager.session = session
+            self.muckrock_api_interface.session = session
+            tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification()
+            await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos])
+            error_infos = []
+            all_agency_suggestions = []
+            for tdo in tdos:
+                subtask = await self.get_subtask(tdo.collector_type)
+                try:
+                    new_agency_suggestions = await self.run_subtask(
+                        subtask,
+                        tdo.url_id,
+                        tdo.collector_metadata
+                    )
+                    all_agency_suggestions.extend(new_agency_suggestions)
+                except Exception as e:
+                    error_info = URLErrorPydanticInfo(
+                        task_id=self.task_id,
+                        url_id=tdo.url_id,
+                        error=str(e),
+                    )
+                    error_infos.append(error_info)

-        non_unknown_agency_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.UNKNOWN]
-        await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions)
-        confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED]
-        await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions)
-        non_confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.CONFIRMED]
-        await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions)
-        await self.adb_client.add_url_error_infos(error_infos)
+            non_unknown_agency_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.UNKNOWN]
+            await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions)
+            confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED]
+            await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions)
+            non_confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.CONFIRMED]
+            await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions)
+            await self.adb_client.add_url_error_infos(error_infos)
diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py
index 0776fe0d..c39ba1e8 100644
--- a/pdap_api_client/AccessManager.py
+++ b/pdap_api_client/AccessManager.py
@@ -35,9 +35,9 @@ class AccessManager:
     """
     def __init__(
         self,
-        session: ClientSession,
         email: str,
         password: str,
+        session: Optional[ClientSession] = None,
         api_key: Optional[str] = None,
     ):
         self.session = session
@@ -46,7 +46,6 @@ def __init__(
         self.api_key = api_key
         self.email = email
         self.password = password
-        self.login(email=email, password=password)

     @property
     async def access_token(self):
self.adb_client.upsert_new_agencies( + suggestions=[ + URLAgencySuggestionInfo( + url_id=-1, + suggestion_type=SuggestionType.UNKNOWN, + pdap_agency_id=agency_id, + agency_name=f"Test Agency {agency_id}", + state=f"Test State {agency_id}", + county=f"Test County {agency_id}", + locality=f"Test Locality {agency_id}" + ) + ] + ) + return agency_id + + async def auto_suggestions( + self, + url_ids: list[int], + num_suggestions: int, + suggestion_type: SuggestionType.AUTO_SUGGESTION or SuggestionType.UNKNOWN + ): + allowed_suggestion_types = [SuggestionType.AUTO_SUGGESTION, SuggestionType.UNKNOWN] + if suggestion_type not in allowed_suggestion_types: + raise ValueError(f"suggestion_type must be one of {allowed_suggestion_types}") + if suggestion_type == SuggestionType.UNKNOWN and num_suggestions > 1: + raise ValueError("num_suggestions must be 1 when suggestion_type is unknown") + + for url_id in url_ids: + suggestions = [] + for i in range(num_suggestions): + if suggestion_type == SuggestionType.UNKNOWN: + agency_id = None + else: + agency_id = await self.agency() + suggestion = URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=suggestion_type, + pdap_agency_id=agency_id + ) + suggestions.append(suggestion) + + await self.adb_client.add_agency_auto_suggestions( + suggestions=suggestions + ) + + async def confirmed_suggestions(self, url_ids: list[int]): + for url_id in url_ids: + await self.adb_client.add_confirmed_agency_url_links( + suggestions=[ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=await self.agency() + ) + ] + ) + + async def manual_suggestion(self, user_id: int, url_id: int, is_new: bool = False): + await self.adb_client.add_agency_manual_suggestion( + agency_id=await self.agency(), + url_id=url_id, + user_id=user_id, + is_new=is_new + ) def urls(self, batch_id: int, url_count: int) -> InsertURLsInfo: diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index 8f80716f..d9a504a7 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -20,6 +20,9 @@ class APITestHelper: mock_huggingface_interface: MagicMock mock_label_studio_interface: MagicMock + def adb_client(self): + return self.db_data_creator.adb_client + MOCK_USER_ID = 1 diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index d3e60e1d..38b7cafd 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -12,6 +12,8 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ + URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse @@ -210,6 +212,23 @@ def post_relevance_annotation_and_get_next( ) return GetNextURLForAnnotationResponse(**data) + async def get_next_agency_annotation(self) -> GetNextURLForAgencyAnnotationResponse: + data = self.get( + url=f"/annotate/agency" + ) + return 
GetNextURLForAgencyAnnotationResponse(**data) + + async def post_agency_annotation_and_get_next( + self, + url_id: int, + agency_annotation_post_info: URLAgencyAnnotationPostInfo + ) -> GetNextURLForAgencyAnnotationResponse: + data = self.post( + url=f"/annotate/agency/{url_id}", + json=agency_annotation_post_info.model_dump(mode='json') + ) + return GetNextURLForAgencyAnnotationResponse(**data) + def get_urls(self, page: int = 1, errors: bool = False) -> GetURLsResponseInfo: data = self.get( url=f"/url", diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 1ee03963..09e5a267 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -4,10 +4,13 @@ from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource +from collector_db.models import UserUrlAgencySuggestion +from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo -from core.enums import RecordType +from core.enums import RecordType, SuggestionType +from helpers.DBDataCreator import BatchURLCreationInfo from tests.test_automated.integration.api.conftest import MOCK_USER_ID async def run_annotation_test( @@ -110,3 +113,223 @@ async def test_annotate_record_type(api_test_helper): metadata_attribute=URLMetadataAttributeType.RECORD_TYPE, expected_metadata_value=RecordType.ACCIDENT_REPORTS.value ) + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestion and has not been annotated by the User + The user should receive all of the auto suggestions with full detail + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that two agency_suggestions exist + assert len(next_annotation.agency_suggestions) == 2 + + for agency_suggestion in next_annotation.agency_suggestions: + assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION + assert agency_suggestion.pdap_agency_id is not None + assert agency_suggestion.agency_name is not None + assert agency_suggestion.state is not None + assert agency_suggestion.county is not None + assert agency_suggestion.locality is not None + + +@pytest.mark.asyncio +async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): + """ + Test Scenario: Single Unknown Auto Suggestion + A URL has a single Unknown Agency Auto Suggestion and has not been 
annotated by the User + The user should receive a single Unknown Auto Suggestion lacking other detail + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + agency_suggestion = next_annotation.agency_suggestions[0] + + assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN + assert agency_suggestion.pdap_agency_id is None + assert agency_suggestion.agency_name is None + assert agency_suggestion.state is None + assert agency_suggestion.county is None + assert agency_suggestion.locality is None + + +@pytest.mark.asyncio +async def test_annotate_agency_single_confirmed_agency(api_test_helper): + """ + Test Scenario: Single Confirmed Agency + A URL has a single Confirmed Agency and has not been annotated by the User + The user should not receive this URL to annotate + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.confirmed_suggestions( + url_ids=buci.url_ids, + ) + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None + +@pytest.mark.asyncio +async def test_annotate_agency_other_user_annotation(api_test_helper): + """ + Test Scenario: Other User Annotation + A URL has been annotated by another User + Our user should still receive this URL to annotate + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + + await ath.db_data_creator.manual_suggestion( + user_id=MOCK_USER_ID + 1, + url_id=buci.url_ids[0], + ) + + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + +@pytest.mark.asyncio +async def test_annotate_agency_submit_and_get_next(api_test_helper): + """ + Test Scenario: Submit and Get Next (no other URL available) + A URL has been annotated by our User, and no other valid URLs have not been annotated + Our user should not receive another URL to annotate + Until another relevant URL is added + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=2, + 
with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + + # User should submit an annotation and receive the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=buci.url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + + ) + assert response.next_annotation is not None + + # User should submit this annotation and receive none for the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=buci.url_ids[1], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + ) + assert response.next_annotation is None + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_new(api_test_helper): + """ + Test Scenario: Submit New + Our user receives an annotation and marks it as `NEW` + This should complete successfully + And within the database the annotation should be marked as `NEW` + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + + # User should submit an annotation and mark it as New + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=buci.url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=True + ) + ) + assert response.next_annotation is None + + # Within database, the annotation should be marked as `NEW` + all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_manual_suggestions) == 1 + assert all_manual_suggestions[0].is_new + diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py index cdf93801..9c31c9cf 100644 --- a/tests/test_automated/integration/collector_db/test_database_structure.py +++ b/tests/test_automated/integration/collector_db/test_database_structure.py @@ -330,7 +330,7 @@ def test_root_url(db_data_creator: DBDataCreator): @pytest.mark.asyncio -async def test_upset_new_agencies(db_data_creator: DBDataCreator): +async def test_upsert_new_agencies(db_data_creator: DBDataCreator): """ Check that if the agency doesn't exist, it is added But if the agency does exist, it is updated with new information @@ -377,4 +377,3 @@ async def test_upset_new_agencies(db_data_creator: DBDataCreator): assert d[0] == "Updated Test Agency" assert d[1] == "Test Agency 1" assert d[2] == "Test Agency 2" - From f98dcd732c6075b358f380ac3ed60138250d9737 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 10 Feb 2025 09:34:20 -0500 Subject: [PATCH 049/182] Revise agency identification annotation logic --- Dockerfile | 3 +- agency_identifier/MuckrockAPIInterface.py | 2 +- alembic/env.py | 12 +- ...daf0_revise_agency_identification_logic.py | 23 +- api/routes/annotate.py | 2 + collector_db/AsyncDatabaseClient.py | 127 +++++++++- core/AsyncCore.py | 18 +- .../GetNextURLForAgencyAnnotationResponse.py | 15 +- .../AgencyIdentificationTaskOperator.py | 59 ++--- pdap_api_client/AccessManager.py | 3 +- tests/helpers/DBDataCreator.py | 70 +++++- 
.../integration/api/conftest.py | 3 + .../api/helpers/RequestValidator.py | 19 ++ .../integration/api/test_annotate.py | 225 +++++++++++++++++- .../collector_db/test_database_structure.py | 3 +- .../integration/core/test_async_core.py | 2 +- 16 files changed, 533 insertions(+), 53 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6cf1d6a2..c93fe158 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,7 @@ COPY alembic.ini ./alembic.ini COPY alembic ./alembic COPY apply_migrations.py ./apply_migrations.py COPY security_manager ./security_manager +COPY pdap_api_client ./pdap_api_client COPY execute.sh ./execute.sh COPY .project-root ./.project-root @@ -45,4 +46,4 @@ EXPOSE 80 RUN chmod +x execute.sh # Use the below for ease of local development, but remove when pushing to GitHub # Because there is no .env file in the repository (for security reasons) -#COPY .env ./.env +COPY .env ./.env diff --git a/agency_identifier/MuckrockAPIInterface.py b/agency_identifier/MuckrockAPIInterface.py index bbc56ee7..703164fc 100644 --- a/agency_identifier/MuckrockAPIInterface.py +++ b/agency_identifier/MuckrockAPIInterface.py @@ -20,7 +20,7 @@ class AgencyLookupResponse(BaseModel): class MuckrockAPIInterface: - def __init__(self, session: ClientSession): + def __init__(self, session: Optional[ClientSession] = None): self.base_url = "https://www.muckrock.com/api_v1/" self.session = session diff --git a/alembic/env.py b/alembic/env.py index 69587988..7eaa1a8b 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,3 +1,4 @@ +from datetime import datetime from logging.config import fileConfig from alembic import context @@ -59,6 +60,13 @@ def run_migrations_online() -> None: and associate a connection with the context. """ + + def process_revision_directives(context, revision, directives): + # 20210801211024 for a migration generated on Aug 1st, 2021 at 21:10:24 + rev_id = datetime.now().strftime("%Y%m%d%H%M%S") + for directive in directives: + directive.rev_id = rev_id + connectable = engine_from_config( config.get_section(config.config_ini_section, {}), prefix="sqlalchemy.", @@ -67,7 +75,9 @@ def run_migrations_online() -> None: with connectable.connect() as connection: context.configure( - connection=connection, target_metadata=target_metadata + connection=connection, + target_metadata=target_metadata, + process_revision_directives=process_revision_directives ) with context.begin_transaction(): diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py index 4db28b9d..62d9930d 100644 --- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py +++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py @@ -77,7 +77,6 @@ def upgrade(): FOR EACH ROW EXECUTE FUNCTION enforce_no_agency_id_if_unknown(); """) - # Create user_url_agency_suggestions table op.create_table( "user_url_agency_suggestions", @@ -90,6 +89,26 @@ def upgrade(): op.create_unique_constraint( "uq_user_url_agency_suggestions", "user_url_agency_suggestions", ["agency_id", "url_id", "user_id"] ) + op.execute(""" + CREATE OR REPLACE FUNCTION enforce_no_agency_id_if_new() + RETURNS TRIGGER AS $$ + BEGIN + IF NEW.is_new = TRUE AND NEW.agency_id IS NOT NULL THEN + RAISE EXCEPTION 'agency_id must be null when is_new is TRUE'; + END IF; + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """) + op.execute(""" + CREATE TRIGGER enforce_no_agency_id_if_new + BEFORE INSERT ON user_url_agency_suggestions + FOR EACH ROW + EXECUTE FUNCTION 
enforce_no_agency_id_if_new(); + """) + + + op.drop_table('url_agency_suggestions') suggestion_type_enum.drop(op.get_bind(), checkfirst=True) @@ -127,4 +146,6 @@ def downgrade(): op.execute(""" DROP FUNCTION IF EXISTS enforce_no_agency_id_if_unknown; """) + op.execute("DROP TRIGGER enforce_no_agency_id_if_new ON user_url_agency_suggestions") + op.execute("DROP FUNCTION enforce_no_agency_id_if_new()") diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 980c16f9..591920ff 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -76,6 +76,7 @@ async def annotate_url_for_record_type_and_get_next_url( ) return result +@annotate_router.get("/agency") async def get_next_url_for_agency_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), @@ -85,6 +86,7 @@ async def get_next_url_for_agency_annotation( ) return result +@annotate_router.post("/agency/{url_id}") async def annotate_url_for_agency_and_get_next_url( url_id: int, agency_annotation_post_info: URLAgencyAnnotationPostInfo, diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index d89ffc10..2f657c54 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -18,8 +18,11 @@ from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ - RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion + RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, \ + UserUrlAgencySuggestion from collector_manager.enums import URLStatus, CollectorType +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ + GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo @@ -27,6 +30,7 @@ from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.enums import BatchStatus, SuggestionType +from html_tag_collector.DataClassTags import convert_to_response_html_info def add_standard_limit_and_offset(statement, page, limit=100): @@ -610,6 +614,106 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li for raw_result in raw_results ] + @session_manager + async def get_next_url_agency_for_annotation( + self, session: AsyncSession, user_id: int + ) -> GetNextURLForAgencyAnnotationResponse: + """ + Retrieve URL for annotation + The URL must + not be a confirmed URL + not have been annotated by this user + have extant autosuggestions + """ + # Select statement + statement = ( + select(URL.id, URL.url) + # Must not be a confirmed URL + .join(ConfirmedUrlAgency, isouter=True) + .where( + ~exists( + select(ConfirmedUrlAgency). + where(ConfirmedUrlAgency.url_id == URL.id). + correlate(URL) + ) + ) + # Must not have been annotated by this user + .join(UserUrlAgencySuggestion, isouter=True) + .where( + ~exists( + select(UserUrlAgencySuggestion). 
+ where( + (UserUrlAgencySuggestion.user_id == user_id) & + (UserUrlAgencySuggestion.url_id == URL.id) + ). + correlate(URL) + ) + ) + # Must have extant autosuggestions + .join(AutomatedUrlAgencySuggestion, isouter=True) + .where( + exists( + select(AutomatedUrlAgencySuggestion). + where(AutomatedUrlAgencySuggestion.url_id == URL.id). + correlate(URL) + ) + ) + ).limit(1) + raw_result = await session.execute(statement) + results = raw_result.all() + if len(results) == 0: + return GetNextURLForAgencyAnnotationResponse( + next_annotation=None + ) + + result = results[0] + url_id = result[0] + url = result[1] + # Get relevant autosuggestions and agency info, if an associated agency exists + statement = ( + select( + AutomatedUrlAgencySuggestion.agency_id, + AutomatedUrlAgencySuggestion.is_unknown, + Agency.name, + Agency.state, + Agency.county, + Agency.locality + ) + .join(Agency, isouter=True) + .where(AutomatedUrlAgencySuggestion.url_id == url_id) + ) + raw_autosuggestions = await session.execute(statement) + autosuggestions = raw_autosuggestions.all() + agency_suggestions = [] + for autosuggestion in autosuggestions: + agency_id = autosuggestion[0] + is_unknown = autosuggestion[1] + name = autosuggestion[2] + state = autosuggestion[3] + county = autosuggestion[4] + locality = autosuggestion[5] + agency_suggestions.append(GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, + pdap_agency_id=agency_id, + agency_name=name, + state=state, + county=county, + locality=locality + )) + + # Get HTML content info + html_content_infos = await self.get_html_content_info(url_id) + response_html_info = convert_to_response_html_info(html_content_infos) + + return GetNextURLForAgencyAnnotationResponse( + next_annotation=GetNextURLForAgencyAnnotationInnerResponse( + url_id=url_id, + url=url, + html_info=response_html_info, + agency_suggestions=agency_suggestions + ) + ) + @session_manager async def upsert_new_agencies( self, @@ -656,4 +760,23 @@ async def add_agency_auto_suggestions( ) session.add(url_agency_suggestion) - await session.commit() \ No newline at end of file + await session.commit() + + @session_manager + async def add_agency_manual_suggestion( + self, + session: AsyncSession, + agency_id: Optional[int], + url_id: int, + user_id: int, + is_new: bool + ): + if is_new and agency_id is not None: + raise ValueError("agency_id must be None when is_new is True") + url_agency_suggestion = UserUrlAgencySuggestion( + url_id=url_id, + agency_id=agency_id, + user_id=user_id, + is_new=is_new + ) + session.add(url_agency_suggestion) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 2ec17da5..808821f7 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -44,6 +44,7 @@ def __init__( self.url_request_interface = url_request_interface self.html_parser = html_parser self.logger = logging.getLogger(__name__) + self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) async def get_url_html_task_operator(self): @@ -71,16 +72,14 @@ async def get_url_record_type_task_operator(self): return operator async def get_agency_identification_task_operator(self): - session = ClientSession() pdap_client = PDAPClient( access_manager=AccessManager( email=get_from_env("PDAP_EMAIL"), password=get_from_env("PDAP_PASSWORD"), api_key=get_from_env("PDAP_API_KEY"), - session=session ), ) - muckrock_api_interface = MuckrockAPIInterface(session=session) + muckrock_api_interface = MuckrockAPIInterface() operator = 
AgencyIdentificationTaskOperator( adb_client=self.adb_client, pdap_client=pdap_client, @@ -208,15 +207,16 @@ async def submit_url_agency_annotation( url_id: int, agency_post_info: URLAgencyAnnotationPostInfo ) -> GetNextURLForAgencyAnnotationResponse: - if agency_post_info.suggested_agency == "NEW": - suggestion_type = SuggestionType.NEW_AGENCY + if not agency_post_info.is_new and not agency_post_info.suggested_agency: + raise ValueError("suggested_agency must be provided if is_new is False") + + if agency_post_info.is_new: agency_suggestion_id = None else: - suggestion_type = SuggestionType.MANUAL_SUGGESTION agency_suggestion_id = agency_post_info.suggested_agency - return await self.adb_client.submit_url_agency_annotation( + return await self.adb_client.add_agency_manual_suggestion( user_id=user_id, url_id=url_id, - suggestion_type=suggestion_type, - agency_suggestion_id=agency_suggestion_id + agency_id=agency_suggestion_id, + is_new=agency_post_info.is_new, ) diff --git a/core/DTOs/GetNextURLForAgencyAnnotationResponse.py b/core/DTOs/GetNextURLForAgencyAnnotationResponse.py index 4710275e..8b3d06f4 100644 --- a/core/DTOs/GetNextURLForAgencyAnnotationResponse.py +++ b/core/DTOs/GetNextURLForAgencyAnnotationResponse.py @@ -1,9 +1,11 @@ from typing import Optional, Literal +from pydantic import BaseModel + from core.enums import SuggestionType from html_tag_collector.DataClassTags import ResponseHTMLInfo -class GetNextURLForAgencyAgencyInfo: +class GetNextURLForAgencyAgencyInfo(BaseModel): suggestion_type: SuggestionType pdap_agency_id: Optional[int] = None agency_name: Optional[str] = None @@ -11,12 +13,17 @@ class GetNextURLForAgencyAgencyInfo: county: Optional[str] = None locality: Optional[str] = None -class GetNextURLForAgencyAnnotationResponse: +class GetNextURLForAgencyAnnotationInnerResponse(BaseModel): url_id: int + url: str agency_suggestions: list[ GetNextURLForAgencyAgencyInfo ] html_info: ResponseHTMLInfo -class URLAgencyAnnotationPostInfo: - suggested_agency: int | Literal["NEW"] \ No newline at end of file +class GetNextURLForAgencyAnnotationResponse(BaseModel): + next_annotation: Optional[GetNextURLForAgencyAnnotationInnerResponse] + +class URLAgencyAnnotationPostInfo(BaseModel): + is_new: bool = False + suggested_agency: Optional[int] = None \ No newline at end of file diff --git a/core/classes/AgencyIdentificationTaskOperator.py b/core/classes/AgencyIdentificationTaskOperator.py index de27f6cb..1589b96f 100644 --- a/core/classes/AgencyIdentificationTaskOperator.py +++ b/core/classes/AgencyIdentificationTaskOperator.py @@ -1,3 +1,5 @@ +from aiohttp import ClientSession + from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo @@ -67,33 +69,36 @@ async def run_subtask(subtask, url_id, collector_metadata) -> list[URLAgencySugg return await subtask.run(url_id=url_id, collector_metadata=collector_metadata) async def inner_task_logic(self): - tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification() - await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) - error_infos = [] - all_agency_suggestions = [] - for tdo in tdos: - subtask = await self.get_subtask(tdo.collector_type) - try: - new_agency_suggestions = await self.run_subtask( - subtask, - tdo.url_id, - tdo.collector_metadata - ) - all_agency_suggestions.extend(new_agency_suggestions) - except Exception as e: - error_info = 
URLErrorPydanticInfo( - task_id=self.task_id, - url_id=tdo.url_id, - error=str(e), - ) - error_infos.append(error_info) + async with ClientSession() as session: + self.pdap_client.access_manager.session = session + self.muckrock_api_interface.session = session + tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification() + await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) + error_infos = [] + all_agency_suggestions = [] + for tdo in tdos: + subtask = await self.get_subtask(tdo.collector_type) + try: + new_agency_suggestions = await self.run_subtask( + subtask, + tdo.url_id, + tdo.collector_metadata + ) + all_agency_suggestions.extend(new_agency_suggestions) + except Exception as e: + error_info = URLErrorPydanticInfo( + task_id=self.task_id, + url_id=tdo.url_id, + error=str(e), + ) + error_infos.append(error_info) - non_unknown_agency_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.UNKNOWN] - await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions) - confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED] - await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions) - non_confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.CONFIRMED] - await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions) - await self.adb_client.add_url_error_infos(error_infos) + non_unknown_agency_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.UNKNOWN] + await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions) + confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED] + await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions) + non_confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.CONFIRMED] + await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions) + await self.adb_client.add_url_error_infos(error_infos) diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py index 0776fe0d..c39ba1e8 100644 --- a/pdap_api_client/AccessManager.py +++ b/pdap_api_client/AccessManager.py @@ -35,9 +35,9 @@ class AccessManager: """ def __init__( self, - session: ClientSession, email: str, password: str, + session: Optional[ClientSession] = None, api_key: Optional[str] = None, ): self.session = session @@ -46,7 +46,6 @@ def __init__( self.api_key = api_key self.email = email self.password = password - self.login(email=email, password=password) @property async def access_token(self): diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 2d6b603f..d550e801 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -1,3 +1,4 @@ +from random import randint from typing import List, Optional from pydantic import BaseModel @@ -13,7 +14,8 @@ from collector_db.DatabaseClient import DatabaseClient from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_manager.enums import CollectorType -from core.enums import BatchStatus +from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.enums import 
BatchStatus, SuggestionType
 from tests.helpers.simple_test_data_functions import generate_test_urls
 
 
@@ -60,6 +62,72 @@ async def batch_and_urls(
 
         return BatchURLCreationInfo(batch_id=batch_id, url_ids=url_ids)
 
+    async def agency(self) -> int:
+        agency_id = randint(1, 99999999)
+        await self.adb_client.upsert_new_agencies(
+            suggestions=[
+                URLAgencySuggestionInfo(
+                    url_id=-1,
+                    suggestion_type=SuggestionType.UNKNOWN,
+                    pdap_agency_id=agency_id,
+                    agency_name=f"Test Agency {agency_id}",
+                    state=f"Test State {agency_id}",
+                    county=f"Test County {agency_id}",
+                    locality=f"Test Locality {agency_id}"
+                )
+            ]
+        )
+        return agency_id
+
+    async def auto_suggestions(
+        self,
+        url_ids: list[int],
+        num_suggestions: int,
+        suggestion_type: SuggestionType
+    ):
+        allowed_suggestion_types = [SuggestionType.AUTO_SUGGESTION, SuggestionType.UNKNOWN]
+        if suggestion_type not in allowed_suggestion_types:
+            raise ValueError(f"suggestion_type must be one of {allowed_suggestion_types}")
+        if suggestion_type == SuggestionType.UNKNOWN and num_suggestions > 1:
+            raise ValueError("num_suggestions must be 1 when suggestion_type is unknown")
+
+        for url_id in url_ids:
+            suggestions = []
+            for i in range(num_suggestions):
+                if suggestion_type == SuggestionType.UNKNOWN:
+                    agency_id = None
+                else:
+                    agency_id = await self.agency()
+                suggestion = URLAgencySuggestionInfo(
+                    url_id=url_id,
+                    suggestion_type=suggestion_type,
+                    pdap_agency_id=agency_id
+                )
+                suggestions.append(suggestion)
+
+            await self.adb_client.add_agency_auto_suggestions(
+                suggestions=suggestions
+            )
+
+    async def confirmed_suggestions(self, url_ids: list[int]):
+        for url_id in url_ids:
+            await self.adb_client.add_confirmed_agency_url_links(
+                suggestions=[
+                    URLAgencySuggestionInfo(
+                        url_id=url_id,
+                        suggestion_type=SuggestionType.CONFIRMED,
+                        pdap_agency_id=await self.agency()
+                    )
+                ]
+            )
+
+    async def manual_suggestion(self, user_id: int, url_id: int, is_new: bool = False):
+        await self.adb_client.add_agency_manual_suggestion(
+            agency_id=None if is_new else await self.agency(),
+            url_id=url_id,
+            user_id=user_id,
+            is_new=is_new
+        )
 
     def urls(self, batch_id: int, url_count: int) -> InsertURLsInfo:
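[Editor's annotation, not part of the original patch: the factory helpers above
are composed by the new tests later in this patch. A minimal sketch of that
composition, assuming `creator` is a DBDataCreator wired to a test database;
the function name is hypothetical:

    from core.enums import SuggestionType

    async def seed_agency_annotation_fixture(creator) -> None:
        # One batch containing a single URL with HTML content attached.
        buci = await creator.batch_and_urls(url_count=1, with_html_content=True)
        # Two full-detail automated agency suggestions for that URL.
        await creator.auto_suggestions(
            url_ids=buci.url_ids,
            num_suggestions=2,
            suggestion_type=SuggestionType.AUTO_SUGGESTION,
        )
]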
diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py
index 8f80716f..d9a504a7 100644
--- a/tests/test_automated/integration/api/conftest.py
+++ b/tests/test_automated/integration/api/conftest.py
@@ -20,6 +20,9 @@ class APITestHelper:
     mock_huggingface_interface: MagicMock
     mock_label_studio_interface: MagicMock
 
+    def adb_client(self):
+        return self.db_data_creator.adb_client
+
 
 MOCK_USER_ID = 1
diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py
index d3e60e1d..38b7cafd 100644
--- a/tests/test_automated/integration/api/helpers/RequestValidator.py
+++ b/tests/test_automated/integration/api/helpers/RequestValidator.py
@@ -12,6 +12,8 @@
 from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse
 from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse
 from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse
+from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \
+    URLAgencyAnnotationPostInfo
 from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse
 from core.DTOs.GetTasksResponse import GetTasksResponse
 from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse
@@ -210,6 +212,23 @@ def post_relevance_annotation_and_get_next(
         )
         return GetNextURLForAnnotationResponse(**data)
 
+    async def get_next_agency_annotation(self) -> GetNextURLForAgencyAnnotationResponse:
+        data = self.get(
+            url=f"/annotate/agency"
+        )
+        return GetNextURLForAgencyAnnotationResponse(**data)
+
+    async def post_agency_annotation_and_get_next(
+        self,
+        url_id: int,
+        agency_annotation_post_info: URLAgencyAnnotationPostInfo
+    ) -> GetNextURLForAgencyAnnotationResponse:
+        data = self.post(
+            url=f"/annotate/agency/{url_id}",
+            json=agency_annotation_post_info.model_dump(mode='json')
+        )
+        return GetNextURLForAgencyAnnotationResponse(**data)
+
     def get_urls(self, page: int = 1, errors: bool = False) -> GetURLsResponseInfo:
         data = self.get(
             url=f"/url",
diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py
index 1ee03963..ef3693c2 100644
--- a/tests/test_automated/integration/api/test_annotate.py
+++ b/tests/test_automated/integration/api/test_annotate.py
@@ -4,10 +4,13 @@
 
 from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo
 from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource
+from collector_db.models import UserUrlAgencySuggestion
+from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo
 from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse
 from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo
 from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo
-from core.enums import RecordType
+from core.enums import RecordType, SuggestionType
+from tests.helpers.DBDataCreator import BatchURLCreationInfo
 from tests.test_automated.integration.api.conftest import MOCK_USER_ID
 
 async def run_annotation_test(
@@ -110,3 +113,223 @@ async def test_annotate_record_type(api_test_helper):
         metadata_attribute=URLMetadataAttributeType.RECORD_TYPE,
         expected_metadata_value=RecordType.ACCIDENT_REPORTS.value
     )
+
+@pytest.mark.asyncio
+async def test_annotate_agency_multiple_auto_suggestions(api_test_helper):
+    """
+    Test Scenario: Multiple Auto Suggestions
+    A URL has multiple Agency Auto Suggestions and has not been annotated by the User
+    The user should receive all of the auto suggestions with full detail
+    """
+    ath = api_test_helper
+    adb_client = ath.adb_client()
+    buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls(
+        url_count=1,
+        with_html_content=True
+    )
+    await ath.db_data_creator.auto_suggestions(
+        url_ids=buci.url_ids,
+        num_suggestions=2,
+        suggestion_type=SuggestionType.AUTO_SUGGESTION
+    )
+
+    # User requests next annotation
+    response = await ath.request_validator.get_next_agency_annotation()
+
+    assert response.next_annotation
+    next_annotation = response.next_annotation
+    # Check that url_id matches the one we inserted
+    assert next_annotation.url_id == buci.url_ids[0]
+
+    # Check that html data is present
+    assert next_annotation.html_info.description != ""
+    assert next_annotation.html_info.title != ""
+
+    # Check that two agency_suggestions exist
+    assert len(next_annotation.agency_suggestions) == 2
+
+    for agency_suggestion in next_annotation.agency_suggestions:
+        assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION
+        assert agency_suggestion.pdap_agency_id is not None
+        assert agency_suggestion.agency_name is not None
+        assert agency_suggestion.state is not None
+        assert agency_suggestion.county is not None
+        assert
agency_suggestion.locality is not None + + +@pytest.mark.asyncio +async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): + """ + Test Scenario: Single Unknown Auto Suggestion + A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User + The user should receive a single Unknown Auto Suggestion lacking other detail + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + agency_suggestion = next_annotation.agency_suggestions[0] + + assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN + assert agency_suggestion.pdap_agency_id is None + assert agency_suggestion.agency_name is None + assert agency_suggestion.state is None + assert agency_suggestion.county is None + assert agency_suggestion.locality is None + + +@pytest.mark.asyncio +async def test_annotate_agency_single_confirmed_agency(api_test_helper): + """ + Test Scenario: Single Confirmed Agency + A URL has a single Confirmed Agency and has not been annotated by the User + The user should not receive this URL to annotate + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.confirmed_suggestions( + url_ids=buci.url_ids, + ) + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None + +@pytest.mark.asyncio +async def test_annotate_agency_other_user_annotation(api_test_helper): + """ + Test Scenario: Other User Annotation + A URL has been annotated by another User + Our user should still receive this URL to annotate + """ + ath = api_test_helper + adb_client = ath.adb_client() + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + + await ath.db_data_creator.manual_suggestion( + user_id=MOCK_USER_ID + 1, + url_id=buci.url_ids[0], + ) + + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + +@pytest.mark.asyncio +async def test_annotate_agency_submit_and_get_next(api_test_helper): + """ + Test Scenario: Submit and Get Next (no other URL available) + A URL has been annotated by our User, and no other valid URLs 
remain unannotated
+    Our user should not receive another URL to annotate
+    Until another relevant URL is added
+    """
+    ath = api_test_helper
+    adb_client = ath.adb_client()
+    buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls(
+        url_count=2,
+        with_html_content=True
+    )
+    await ath.db_data_creator.auto_suggestions(
+        url_ids=buci.url_ids,
+        num_suggestions=1,
+        suggestion_type=SuggestionType.UNKNOWN
+    )
+
+    # User should submit an annotation and receive the next
+    response = await ath.request_validator.post_agency_annotation_and_get_next(
+        url_id=buci.url_ids[0],
+        agency_annotation_post_info=URLAgencyAnnotationPostInfo(
+            suggested_agency=await ath.db_data_creator.agency(),
+            is_new=False
+        )
+
+    )
+    assert response.next_annotation is not None
+
+    # User should submit this annotation and receive none for the next
+    response = await ath.request_validator.post_agency_annotation_and_get_next(
+        url_id=buci.url_ids[1],
+        agency_annotation_post_info=URLAgencyAnnotationPostInfo(
+            suggested_agency=await ath.db_data_creator.agency(),
+            is_new=False
+        )
+    )
+    assert response.next_annotation is None
+
+
+@pytest.mark.asyncio
+async def test_annotate_agency_submit_new(api_test_helper):
+    """
+    Test Scenario: Submit New
+    Our user receives an annotation and marks it as `NEW`
+    This should complete successfully
+    And within the database the annotation should be marked as `NEW`
+    """
+    ath = api_test_helper
+    adb_client = ath.adb_client()
+    buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls(
+        url_count=1,
+        with_html_content=True
+    )
+    await ath.db_data_creator.auto_suggestions(
+        url_ids=buci.url_ids,
+        num_suggestions=1,
+        suggestion_type=SuggestionType.UNKNOWN
+    )
+
+    # User should submit an annotation and mark it as New
+    response = await ath.request_validator.post_agency_annotation_and_get_next(
+        url_id=buci.url_ids[0],
+        agency_annotation_post_info=URLAgencyAnnotationPostInfo(
+            suggested_agency=await ath.db_data_creator.agency(),
+            is_new=True
+        )
+    )
+    assert response.next_annotation is None
+
+    # Within database, the annotation should be marked as `NEW`
+    all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion)
+    assert len(all_manual_suggestions) == 1
+    assert all_manual_suggestions[0].is_new
+
diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py
index cdf93801..9c31c9cf 100644
--- a/tests/test_automated/integration/collector_db/test_database_structure.py
+++ b/tests/test_automated/integration/collector_db/test_database_structure.py
@@ -330,7 +330,7 @@ def test_root_url(db_data_creator: DBDataCreator):
 
 
 @pytest.mark.asyncio
-async def test_upset_new_agencies(db_data_creator: DBDataCreator):
+async def test_upsert_new_agencies(db_data_creator: DBDataCreator):
     """
     Check that if the agency doesn't exist, it is added
     But if the agency does exist, it is updated with new information
@@ -377,4 +377,3 @@ async def test_upsert_new_agencies(db_data_creator: DBDataCreator):
     assert d[0] == "Updated Test Agency"
     assert d[1] == "Test Agency 1"
     assert d[2] == "Test Agency 2"
-
diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py
index 3b99d15c..ad72f2bf 100644
--- a/tests/test_automated/integration/core/test_async_core.py
+++ b/tests/test_automated/integration/core/test_async_core.py
@@ -8,7 +8,7 @@
 from core.AsyncCore import AsyncCore
 from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome
 from core.enums import BatchStatus
-from helpers.DBDataCreator import DBDataCreator
+from tests.helpers.DBDataCreator import DBDataCreator
 
 @pytest.mark.asyncio
 async def test_conclude_task_success(db_data_creator: DBDataCreator):
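[Editor's annotation, not part of the patch series: Patch 049 stops creating a
long-lived aiohttp ClientSession when the API clients are constructed and
instead has the task operator open one session per run. A minimal sketch of
that lifecycle, using hypothetical names (only ClientSession is real aiohttp
API):

    from typing import Optional
    from aiohttp import ClientSession

    class ApiClient:
        # The session is optional at construction and attached just before use.
        def __init__(self, session: Optional[ClientSession] = None):
            self.session = session

    async def run_task(client: ApiClient) -> None:
        # One session per task run, closed automatically on exit.
        async with ClientSession() as session:
            client.session = session
            # ... perform requests through client.session here

This keeps session cleanup in one place and guarantees the session is closed
when the run finishes.]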
From 31266b611c31237f524e364538a1d5dab569fa44 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 10 Feb 2025 09:52:17 -0500
Subject: [PATCH 050/182] Comment out `COPY .env`

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index c93fe158..dfcb1392 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,4 +46,4 @@ EXPOSE 80
 RUN chmod +x execute.sh
 # Use the below for ease of local development, but remove when pushing to GitHub
 # Because there is no .env file in the repository (for security reasons)
-COPY .env ./.env
+#COPY .env ./.env

From a049497688a997cd1abc93a2ae69ff387d6a5ff1 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Fri, 21 Feb 2025 08:55:36 -0500
Subject: [PATCH 051/182] DRAFT

---
 ENV.md                                        |  10 +-
 api/main.py                                   |   3 +
 api/routes/review.py                          |  19 ++
 collector_db/AsyncDatabaseClient.py           | 198 +++++++++++++++-
 collector_db/DTOConverter.py                  | 198 ++++++++++++++++
 collector_db/models.py                        |   6 +-
 core/AsyncCore.py                             |   5 +-
 core/DTOs/GetNextURLForFinalReviewResponse.py |  64 ++++++
 security_manager/SecurityManager.py           |   1 +
 tests/helpers/DBDataCreator.py                |  75 +++++-
 .../collector_db/test_db_client.py            | 215 +++++++++++++++++-
 11 files changed, 770 insertions(+), 24 deletions(-)
 create mode 100644 api/routes/review.py
 create mode 100644 collector_db/DTOConverter.py
 create mode 100644 core/DTOs/GetNextURLForFinalReviewResponse.py
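[Editor's annotation, not part of the patch series: this DRAFT patch adds a
`GET /review/next-source` endpoint (api/routes/review.py below) and a uvicorn
entry point in api/main.py. A hypothetical smoke test against a locally
running instance, assuming bearer-token auth with a JWT the security manager
accepts; the `requests` dependency and the token value are illustrative only:

    import requests

    token = "<JWT signed with DS_APP_SECRET_KEY>"  # placeholder value
    response = requests.get(
        "http://localhost:8000/review/next-source",
        headers={"Authorization": f"Bearer {token}"},
    )
    print(response.status_code, response.json())
]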
diff --git a/ENV.md b/ENV.md
index 8fd30c33..68359348 100644
--- a/ENV.md
+++ b/ENV.md
@@ -14,11 +14,11 @@ Please ensure these are properly defined in a `.env` file in the root directory.
 |`POSTGRES_DB` | The database name for the test database | `source_collector_test_db` |
 |`POSTGRES_HOST` | The host for the test database | `127.0.0.1` |
 |`POSTGRES_PORT` | The port for the test database | `5432` |
-|`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token that is used in the Data Sources App for encoding. | `abc123` |
+|`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token `JWT_SECRET_KEY` that is used in the Data Sources App for encoding. | `abc123` |
 |`DEV`| Set to any value to run the application in development mode. | `true` |
 |`DEEPSEEK_API_KEY`| The API key required for accessing the DeepSeek API. | `abc123` |
-|`OPENAI_API_KEY`| The API key required for accessing the OpenAI API.| `abc123` |
-|`PDAP_EMAIL`| An email address for accessing the PDAP API.| `abc123@test.com` |
-|`PDAP_PASSWORD`| A password for accessing the PDAP API.| `abc123` |
-|`PDAP_API_KEY`| An API key for accessing the PDAP API.| `abc123` |
+|`OPENAI_API_KEY`| The API key required for accessing the OpenAI API. | `abc123` |
+|`PDAP_EMAIL`| An email address for accessing the PDAP API. | `abc123@test.com` |
+|`PDAP_PASSWORD`| A password for accessing the PDAP API. | `abc123` |
+|`PDAP_API_KEY`| An API key for accessing the PDAP API. | `abc123` |
diff --git a/api/main.py b/api/main.py
index 0a9c0249..34cebdd1 100644
--- a/api/main.py
+++ b/api/main.py
@@ -1,5 +1,6 @@
 from contextlib import asynccontextmanager
 
+import uvicorn
 from fastapi import FastAPI
 
 from api.routes.annotate import annotate_router
@@ -84,3 +85,5 @@ async def setup_database(db_client):
 
 app.include_router(router)
 
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file
diff --git a/api/routes/review.py b/api/routes/review.py
new file mode 100644
index 00000000..f1b7210a
--- /dev/null
+++ b/api/routes/review.py
@@ -0,0 +1,19 @@
+from fastapi import APIRouter, Depends
+
+from api.dependencies import get_async_core
+from core.AsyncCore import AsyncCore
+from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse
+from security_manager.SecurityManager import AccessInfo, get_access_info
+
+review_router = APIRouter(
+    prefix="/review",
+    tags=["Review"],
+    responses={404: {"description": "Not found"}},
+)
+
+@review_router.get("/next-source")
+async def get_next_source(
+    core: AsyncCore = Depends(get_async_core),
+    access_info: AccessInfo = Depends(get_access_info),
+) -> GetNextURLForFinalReviewResponse:
+    return await core.get_next_source_for_review()
\ No newline at end of file
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index 2f657c54..62a44fbf 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -1,11 +1,12 @@
 from functools import wraps
 from typing import Optional
 
-from sqlalchemy import select, exists, func
+from sqlalchemy import select, exists, func, distinct, case, desc, asc
 from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
-from sqlalchemy.orm import selectinload, aliased
+from sqlalchemy.orm import selectinload, aliased, joinedload
 
 from collector_db.ConfigManager import ConfigManager
+from collector_db.DTOConverter import DTOConverter
 from collector_db.DTOs.MetadataAnnotationInfo import MetadataAnnotationInfo
 from collector_db.DTOs.TaskInfo import TaskInfo
 from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo
@@ -23,6 +24,8 @@
 from collector_manager.enums import URLStatus, CollectorType
 from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \
     GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo
+from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo, \
+    FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyInfo
 from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \
     GetURLsResponseInnerInfo
@@ -47,10 +50,12 @@ def __init__(self, db_url: str = get_postgres_connection_string(is_async=True)):
         self.statement_composer = StatementComposer()
 
     @staticmethod
-    def _add_models(session: AsyncSession, model_class, models):
-        for model in models:
-            instance = model_class(**model.model_dump())
-            session.add(instance)
+    async def _add_models(session: AsyncSession, model_class, models) -> list[int]:
+        instances = [model_class(**model.model_dump()) for model in models]
+        session.add_all(instances)
+        await session.flush()
+        return [instance.id for instance in instances]
+
 
     @staticmethod
@@ -86,12 +91,13 @@ async def
get_url_metadata_by_status( return [URLMetadataInfo(**url_metadata.__dict__) for url_metadata in model_result] @session_manager - async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo): - self._add_models(session, URLMetadata, [url_metadata_info]) + async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo) -> int: + result = await self._add_models(session, URLMetadata, [url_metadata_info]) + return result[0] @session_manager - async def add_url_metadatas(self, session: AsyncSession, url_metadata_infos: list[URLMetadataInfo]): - self._add_models(session, URLMetadata, url_metadata_infos) + async def add_url_metadatas(self, session: AsyncSession, url_metadata_infos: list[URLMetadataInfo]) -> list[int]: + return await self._add_models(session, URLMetadata, url_metadata_infos) @session_manager async def add_url_error_infos(self, session: AsyncSession, url_error_infos: list[URLErrorPydanticInfo]): @@ -125,7 +131,7 @@ async def get_urls_with_errors(self, session: AsyncSession) -> list[URLErrorPyda @session_manager async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]): - self._add_models(session, URLHTMLContent, html_content_infos) + await self._add_models(session, URLHTMLContent, html_content_infos) @session_manager async def has_pending_urls_without_html_data(self, session: AsyncSession) -> bool: @@ -312,7 +318,7 @@ async def get_next_url_for_annotation( return annotation_info @session_manager - async def add_relevance_annotation( + async def add_metadata_annotation( self, session: AsyncSession, user_id: int, @@ -780,3 +786,171 @@ async def add_agency_manual_suggestion( is_new=is_new ) session.add(url_agency_suggestion) + + @session_manager + async def get_next_url_for_final_review(self, session: AsyncSession) -> GetNextURLForFinalReviewResponse: + + # Subqueries for ORDER clause + + # Subqueries for Counting distinct annotations + # Count distinct auto annotations for metadata + distinct_auto_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(distinct(URLMetadata.attribute)).label("auto_count") + ). + group_by(URLMetadata.url_id).subquery() + ) + # Count distinct user annotations for metadata + distinct_user_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(distinct(URLMetadata.attribute)).label("user_count") + ).join(MetadataAnnotation). + where(MetadataAnnotation.user_id != None). 
+ group_by(URLMetadata.url_id).subquery() + ) + + + # Count whether agency auto annotations exist + # (Note: Can be either confirmed or auto suggestion) + agency_annotations_exist_subquery = ( + select( + URL.id, + case( + ( + exists().where(URL.id == ConfirmedUrlAgency.url_id), 1 + ), + ( + exists().where(URL.id == AutomatedUrlAgencySuggestion.url_id), 1 + ), + else_=0 + ).label("agency_annotations_exist") + ).subquery() + ) + + # Count whether agency user annotations exist + agency_user_annotations_exist_subquery = ( + select( + URL.id, + case( + ( + exists().where(URL.id == UserUrlAgencySuggestion.url_id), 1 + ), + else_=0 + ).label("agency_user_annotations_exist") + ).subquery() + ) + + # Subqueries for counting *all* annotations + + # Count all auto annotations for metadata + all_auto_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(URLMetadata.attribute).label("auto_count") + ).group_by(URLMetadata.url_id).subquery() + ) + # Count all user annotations for metadata + all_user_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(URLMetadata.attribute).label("user_count") + ).join(MetadataAnnotation). + where(MetadataAnnotation.user_id != None). + group_by(URLMetadata.url_id).subquery() + ) + + # Count all user agency annotations + all_user_agency_annotations_subquery = ( + select( + UserUrlAgencySuggestion.url_id, + func.count(UserUrlAgencySuggestion.agency_id).label("user_count") + ).group_by(UserUrlAgencySuggestion.url_id).subquery() + ) + + + + # Basic URL query + url_query = ( + select( + URL, + ( + func.coalesce(distinct_auto_metadata_subquery.c.auto_count, 0) + + func.coalesce(distinct_user_metadata_subquery.c.user_count, 0) + + func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + + func.coalesce(agency_user_annotations_exist_subquery.c.agency_user_annotations_exist, 0) + ).label("total_distinct_annotation_count"), + ( + func.coalesce(all_auto_metadata_subquery.c.auto_count, 0) + + func.coalesce(all_user_metadata_subquery.c.user_count, 0) + + func.coalesce(all_user_agency_annotations_subquery.c.user_count, 0) + + func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + ).label("total_overall_annotation_count") + ).outerjoin( + distinct_auto_metadata_subquery, URL.id == distinct_auto_metadata_subquery.c.url_id + ).outerjoin( + distinct_user_metadata_subquery, URL.id == distinct_user_metadata_subquery.c.url_id + ).outerjoin( + agency_annotations_exist_subquery, URL.id == agency_annotations_exist_subquery.c.id + ).outerjoin( + agency_user_annotations_exist_subquery, URL.id == agency_user_annotations_exist_subquery.c.id + ).outerjoin( + all_auto_metadata_subquery, URL.id == all_auto_metadata_subquery.c.url_id + ).outerjoin( + all_user_metadata_subquery, URL.id == all_user_metadata_subquery.c.url_id + ).outerjoin( + all_user_agency_annotations_subquery, URL.id == all_user_agency_annotations_subquery.c.url_id + ) + ) + options = [ + joinedload(URL.html_content), + joinedload(URL.url_metadata).joinedload(URLMetadata.annotations), + joinedload(URL.automated_agency_suggestions).joinedload(AutomatedUrlAgencySuggestion.agency), + joinedload(URL.user_agency_suggestions).joinedload(UserUrlAgencySuggestion.agency), + joinedload(URL.confirmed_agencies).joinedload(ConfirmedUrlAgency.agency), + ] + + # Apply options + url_query = url_query.options(*options) + + # Apply order clause + url_query = url_query.order_by( + desc("total_distinct_annotation_count"), + desc("total_overall_annotation_count"), + ) + + + # 
Apply limit + url_query = url_query.limit(1) + + # Execute query + raw_result = await session.execute(url_query) + full_result = raw_result.unique().all()[0] + result: URL = full_result[0] + + # Convert html content to response format + html_content = result.html_content + html_content_infos = [URLHTMLContentInfo(**html_info.__dict__) for html_info in html_content] + + automated_agency_suggestions = result.automated_agency_suggestions + user_agency_suggestions = result.user_agency_suggestions + confirmed_agencies = result.confirmed_agencies + url_metadatas = result.url_metadata + + # Return + return GetNextURLForFinalReviewResponse( + id=result.id, + url=result.url, + html_info=convert_to_response_html_info(html_content_infos), + annotations=FinalReviewAnnotationInfo( + relevant=DTOConverter.final_review_annotation_relevant_info(url_metadatas), + record_type=DTOConverter.final_review_annotation_record_type_info(url_metadatas), + agency=DTOConverter.final_review_annotation_agency_info( + automated_agency_suggestions=automated_agency_suggestions, + confirmed_agencies=confirmed_agencies, + user_agency_suggestions=user_agency_suggestions + ) + ) + ) + diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py new file mode 100644 index 00000000..63c88d92 --- /dev/null +++ b/collector_db/DTOConverter.py @@ -0,0 +1,198 @@ +from collector_db.enums import ValidationStatus, ValidationSource, URLMetadataAttributeType +from collector_db.models import URLMetadata, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, \ + MetadataAnnotation +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo +from core.DTOs.GetNextURLForFinalReviewResponse import FinalReviewAnnotationRelevantInfo, \ + FinalReviewAnnotationRelevantUsersInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ + FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyUserInfo +from core.enums import RecordType, SuggestionType + + +def get_url_metadata( + url_metadatas: list[URLMetadata], + validation_status: ValidationStatus, + validation_source: ValidationSource, + attribute: URLMetadataAttributeType +): + for url_metadata in url_metadatas: + if url_metadata.validation_status != validation_status.value: + continue + if url_metadata.validation_source != validation_source.value: + continue + if url_metadata.attribute != attribute.value: + continue + return url_metadata + + + +class DTOConverter: + + """ + Converts SQLAlchemy objects to DTOs + """ + + @staticmethod + def final_review_annotation_relevant_info( + url_metadatas: list[URLMetadata] + ) -> FinalReviewAnnotationRelevantInfo: + relevant_metadata = get_url_metadata( + url_metadatas=url_metadatas, + validation_status=ValidationStatus.PENDING_VALIDATION, + validation_source=ValidationSource.MACHINE_LEARNING, + attribute=URLMetadataAttributeType.RELEVANT + ) + auto_value = relevant_metadata.value if relevant_metadata else None + if auto_value is not None: + auto_value = (auto_value == "True") + + + annotations: list[MetadataAnnotation] = relevant_metadata.annotations if relevant_metadata else [] + relevant_count = 0 + not_relevant_count = 0 + for annotation in annotations: + if annotation.value == "True": + relevant_count += 1 + else: + not_relevant_count += 1 + return FinalReviewAnnotationRelevantInfo( + auto=auto_value, + users=FinalReviewAnnotationRelevantUsersInfo( + relevant=relevant_count, + not_relevant=not_relevant_count + ) + ) + + @staticmethod + def 
final_review_annotation_record_type_info( + url_metadata: list[URLMetadata] + ): + record_type_metadata = get_url_metadata( + url_metadatas=url_metadata, + validation_status=ValidationStatus.PENDING_VALIDATION, + validation_source=ValidationSource.MACHINE_LEARNING, + attribute=URLMetadataAttributeType.RECORD_TYPE + ) + user_count = {} + if record_type_metadata is None: + auto_value = None + annotations = [] + else: + auto_value = RecordType(record_type_metadata.value) + annotations = record_type_metadata.annotations + for annotation in annotations: + value = RecordType(annotation.value) + if value not in user_count: + user_count[value] = 0 + user_count[value] += 1 + # Sort users by count, descending + user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) + + return FinalReviewAnnotationRecordTypeInfo( + auto=auto_value, + users=user_count + ) + + @staticmethod + def final_review_annotation_agency_auto_info( + automated_agency_suggestions: list[AutomatedUrlAgencySuggestion] + ) -> FinalReviewAnnotationAgencyAutoInfo: + + if len(automated_agency_suggestions) == 0: + return FinalReviewAnnotationAgencyAutoInfo( + unknown=True, + suggestions=[] + ) + + + if len(automated_agency_suggestions) == 1: + suggestion = automated_agency_suggestions[0] + unknown = suggestion.is_unknown + else: + unknown = False + + if unknown: + return FinalReviewAnnotationAgencyAutoInfo( + unknown=True, + suggestions=[ + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.UNKNOWN, + ) + ] + ) + + return FinalReviewAnnotationAgencyAutoInfo( + unknown=unknown, + suggestions=[ + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=suggestion.agency_id, + agency_name=suggestion.agency.name, + state=suggestion.agency.state, + county=suggestion.agency.county, + locality=suggestion.agency.locality + ) for suggestion in automated_agency_suggestions + ] + ) + + @staticmethod + def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_url_agency_suggestions: list[UserUrlAgencySuggestion] + ) -> dict[int, FinalReviewAnnotationAgencyUserInfo]: + d = {} + for suggestion in user_url_agency_suggestions: + agency = suggestion.agency + agency_id = agency.agency_id + if agency.agency_id not in d: + d[agency_id] = FinalReviewAnnotationAgencyUserInfo( + suggestion_type=SuggestionType.MANUAL_SUGGESTION, + agency_name=agency.name, + pdap_agency_id=agency_id, + state=agency.state, + county=agency.county, + locality=agency.locality, + count=1 + ) + else: + d[agency_id].count += 1 + + # Return sorted + return dict(sorted(d.items(), key=lambda x: x[1].count, reverse=True)) + + + @staticmethod + def final_review_annotation_agency_info( + automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], + confirmed_agencies: list[ConfirmedUrlAgency], + user_agency_suggestions: list[UserUrlAgencySuggestion] + ): + if len(confirmed_agencies) == 1: + confirmed_agency = confirmed_agencies[0] + confirmed_agency_info = GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=confirmed_agency.agency_id, + agency_name=confirmed_agency.agency.name, + state=confirmed_agency.agency.state, + county=confirmed_agency.agency.county, + locality=confirmed_agency.agency.locality + ) + return FinalReviewAnnotationAgencyInfo( + confirmed=confirmed_agency_info, + users=None, + auto=None + ) + + + agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( + automated_agency_suggestions + ) + + agency_user_info = 
DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestions + ) + + return FinalReviewAnnotationAgencyInfo( + confirmed=None, + users=agency_user_info, + auto=agency_auto_info + ) + diff --git a/collector_db/models.py b/collector_db/models.py index ee43f35b..f462198c 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -111,7 +111,7 @@ class URLMetadata(Base): name="uq_url_id_attribute"), ) - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True, autoincrement=True) url_id = Column(Integer, ForeignKey('urls.id', name='url_metadata_url_id_fkey'), nullable=False) attribute = Column( PGEnum('Record Type', 'Agency', 'Relevant', name='url_attribute'), @@ -143,7 +143,7 @@ class MetadataAnnotation(Base): name="metadata_annotations_uq_user_id_metadata_id"), ) - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True, autoincrement=True) user_id = Column(Integer, nullable=False) metadata_id = Column(Integer, ForeignKey('url_metadata.id'), nullable=False) value = Column(Text, nullable=False) @@ -193,7 +193,7 @@ class URLHTMLContent(Base): name="uq_url_id_content_type"), ) - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True, autoincrement=True) url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) content_type = Column( PGEnum('Title', 'Description', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'Div', name='url_html_content_type'), diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 808821f7..d2085540 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -180,7 +180,7 @@ async def submit_url_annotation( annotation: str, metadata_type: URLMetadataAttributeType ) -> GetNextURLForAnnotationResponse: - await self.adb_client.add_relevance_annotation( + await self.adb_client.add_metadata_annotation( user_id=user_id, metadata_id=metadata_id, annotation=annotation) @@ -220,3 +220,6 @@ async def submit_url_agency_annotation( agency_id=agency_suggestion_id, is_new=agency_post_info.is_new, ) + + async def get_next_source_for_review(self): + return await self.adb_client.get_next_url_for_final_review() diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py new file mode 100644 index 00000000..8a7077a1 --- /dev/null +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -0,0 +1,64 @@ +from typing import Optional + +from pydantic import BaseModel, Field + +from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo +from core.enums import RecordType +from html_tag_collector.DataClassTags import ResponseHTMLInfo + +# Todo: Add descriptions + +class FinalReviewAnnotationRelevantUsersInfo(BaseModel): + relevant: int = Field(title="Number of users who marked the URL as relevant") + not_relevant: int = Field(title="Number of users who marked the URL as not relevant") + +class FinalReviewAnnotationRelevantInfo(BaseModel): + auto: Optional[bool] = Field(title="Whether the auto-labeler has marked the URL as relevant") + users: FinalReviewAnnotationRelevantUsersInfo = Field( + title="How users identified the relevancy of the source", + ) + +class FinalReviewAnnotationRecordTypeInfo(BaseModel): + auto: Optional[RecordType] = Field(title="The record type suggested by the auto-labeler") + users: dict[RecordType, int] = Field( + title="A dictionary, sorted by size and omitting zero values, of all record types suggested by users", 
+ ) + +class FinalReviewAnnotationAgencyUserInfo(GetNextURLForAgencyAgencyInfo): + count: int = Field(title="Number of times suggested by users") + +class FinalReviewAnnotationAgencyAutoInfo(BaseModel): + unknown: bool = Field(title="Whether the auto-labeler suggested the URL as unknown") + suggestions: Optional[list[GetNextURLForAgencyAgencyInfo]] = Field( + title="A list of agencies, if any, suggested by the auto-labeler", + ) + +class FinalReviewAnnotationAgencyInfo(BaseModel): + confirmed: Optional[GetNextURLForAgencyAgencyInfo] = Field( + title="The confirmed agency for the URL", + ) + auto: Optional[FinalReviewAnnotationAgencyAutoInfo] = Field( + title="A single agency or a list of agencies suggested by the auto-labeler",) + users: Optional[dict[int, FinalReviewAnnotationAgencyUserInfo]] = Field( + title="A list, sorted by size, of all agencies suggested by users", + ) + +class FinalReviewAnnotationInfo(BaseModel): + relevant: FinalReviewAnnotationRelevantInfo = Field( + title="User and auto annotations for relevancy", + ) + record_type: FinalReviewAnnotationRecordTypeInfo = Field( + title="User and auto annotations for record type", + ) + agency: FinalReviewAnnotationAgencyInfo = Field( + title="User and auto annotations for agency", + ) + +class GetNextURLForFinalReviewResponse(BaseModel): + id: int = Field(title="The id of the URL") + url: str = Field(title="The URL") + html_info: ResponseHTMLInfo = Field(title="The HTML content of the URL") + annotations: FinalReviewAnnotationInfo = Field( + title="The annotations for the URL, from both users and the auto-labeler", + ) \ No newline at end of file diff --git a/security_manager/SecurityManager.py b/security_manager/SecurityManager.py index 18bc6a26..8d80f46c 100644 --- a/security_manager/SecurityManager.py +++ b/security_manager/SecurityManager.py @@ -39,6 +39,7 @@ def __init__( def validate_token(self, token: str) -> AccessInfo: try: payload = jwt.decode(token, self.secret_key, algorithms=[ALGORITHM]) + print(payload) return self.payload_to_access_info(payload) except InvalidTokenError as e: raise HTTPException( diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index d550e801..86efe510 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -186,9 +186,10 @@ async def metadata( value: str = "False", validation_status: ValidationStatus = ValidationStatus.PENDING_VALIDATION, validation_source: ValidationSource = ValidationSource.MACHINE_LEARNING - ): + ) -> list[int]: + metadata_ids = [] for url_id in url_ids: - await self.adb_client.add_url_metadata( + metadata_id = await self.adb_client.add_url_metadata( URLMetadataInfo( url_id=url_id, attribute=attribute, @@ -197,6 +198,8 @@ async def metadata( validation_source=validation_source, ) ) + metadata_ids.append(metadata_id) + return metadata_ids async def error_info( self, @@ -215,3 +218,71 @@ async def error_info( error_infos.append(url_error_info) await self.adb_client.add_url_error_infos(error_infos) + async def user_annotation( + self, + metadata_id: int, + user_id: Optional[int] = None, + annotation: str = "test annotation" + ): + if user_id is None: + user_id = randint(1, 99999999) + await self.adb_client.add_metadata_annotation( + user_id=user_id, + metadata_id=metadata_id, + annotation=annotation + ) + + async def agency_auto_suggestions( + self, + url_id: int, + count: int, + suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION + ): + if suggestion_type == SuggestionType.UNKNOWN: + count = 1 # Can only be one 
auto suggestion if unknown + + await self.adb_client.add_agency_auto_suggestions( + suggestions=[ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=suggestion_type, + pdap_agency_id=None if suggestion_type == SuggestionType.UNKNOWN else await self.agency(), + state="Test State", + county="Test County", + locality="Test Locality" + ) for _ in range(count) + ] + ) + + async def agency_confirmed_suggestion( + self, + url_id: int + ): + + await self.adb_client.add_confirmed_agency_url_links( + suggestions=[ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=await self.agency() + ) + ] + ) + + async def agency_user_suggestions( + self, + url_id: int, + user_id: Optional[int] = None, + agency_id: Optional[int] = None + ): + if user_id is None: + user_id = randint(1, 99999999) + + if agency_id is None: + agency_id = await self.agency() + await self.adb_client.add_agency_manual_suggestion( + agency_id=agency_id, + url_id=url_id, + user_id=user_id, + is_new=False + ) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index fa3b7110..9444bc79 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -10,7 +10,7 @@ from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource from collector_manager.enums import URLStatus -from core.enums import BatchStatus +from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator @@ -194,3 +194,216 @@ async def test_get_urls_with_metadata(db_data_creator: DBDataCreator): ) assert len(results) == 1 +async def setup_for_get_next_url_for_final_review( + db_data_creator: DBDataCreator, + annotation_count: int, + include_user_annotations: bool = True +): + batch_id = db_data_creator.batch() + url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] + await db_data_creator.html_data([url_mapping.url_id]) + + async def add_metadata_annotation(count: int, value: str, metadata_id: int): + for i in range(count): + await db_data_creator.user_annotation( + metadata_id=metadata_id, + annotation=value + ) + + async def add_user_suggestion(count: int): + agency_id = await db_data_creator.agency() + for i in range(count): + await db_data_creator.agency_user_suggestions( + url_id=url_mapping.url_id, + agency_id=agency_id + ) + + relevant_metadata_ids = await db_data_creator.metadata( + url_ids=[url_mapping.url_id], + attribute=URLMetadataAttributeType.RELEVANT, + value="True", + validation_source=ValidationSource.MACHINE_LEARNING, + validation_status=ValidationStatus.PENDING_VALIDATION + ) + relevant_metadata_id = relevant_metadata_ids[0] + record_type_metadata_ids = await db_data_creator.metadata( + url_ids=[url_mapping.url_id], + attribute=URLMetadataAttributeType.RECORD_TYPE, + value=RecordType.ARREST_RECORDS.value, + validation_source=ValidationSource.MACHINE_LEARNING, + validation_status=ValidationStatus.PENDING_VALIDATION + ) + record_type_metadata_id = record_type_metadata_ids[0] + + if include_user_annotations: + await add_metadata_annotation(annotation_count, "True", relevant_metadata_id) + await add_metadata_annotation(1, "False", relevant_metadata_id) + await add_metadata_annotation(3, RecordType.ARREST_RECORDS.value, record_type_metadata_id) + await 
add_metadata_annotation(2, RecordType.DISPATCH_RECORDINGS.value, record_type_metadata_id) + await add_metadata_annotation(1, RecordType.ACCIDENT_REPORTS.value, record_type_metadata_id) + + if include_user_annotations: + # Add user suggestions for agencies, one suggested by 3 users, another by 2, another by 1 + for i in range(annotation_count): + await add_user_suggestion(i + 1) + + + return url_mapping + + +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreator): + """ + Test that an annotated URL is returned + """ + + url_mapping = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + await db_data_creator.agency_auto_suggestions( + url_id=url_mapping.url_id, + count=3 + ) + + + result = await db_data_creator.adb_client.get_next_url_for_final_review() + + assert result.url == url_mapping.url + html_info = result.html_info + assert html_info.description == "test description" + assert html_info.title == "test html content" + + annotation_info = result.annotations + relevant_info = annotation_info.relevant + assert relevant_info.auto == True + assert relevant_info.users.relevant == 3 + assert relevant_info.users.not_relevant == 1 + + record_type_info = annotation_info.record_type + assert record_type_info.auto == RecordType.ARREST_RECORDS + user_d = record_type_info.users + assert user_d[RecordType.ARREST_RECORDS] == 3 + assert user_d[RecordType.DISPATCH_RECORDINGS] == 2 + assert user_d[RecordType.ACCIDENT_REPORTS] == 1 + assert list(user_d.keys()) == [RecordType.ARREST_RECORDS, RecordType.DISPATCH_RECORDINGS, RecordType.ACCIDENT_REPORTS] + + + agency_info = annotation_info.agency + auto_agency_suggestions = agency_info.auto + assert auto_agency_suggestions.unknown == False + assert len(auto_agency_suggestions.suggestions) == 3 + + # Check user agency suggestions exist and in descending order of count + user_agency_suggestions = agency_info.users + user_agency_suggestions_as_list = list(user_agency_suggestions.values()) + assert len(user_agency_suggestions_as_list) == 3 + for i in range(3): + assert user_agency_suggestions_as_list[i].count == 3 - i + + + +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_favor_more_components(db_data_creator: DBDataCreator): + """ + Test in the case of two URLs, favoring the one with more annotations for more components + i.e., if one has annotations for record type and agency id, that should be favored over one with just record type + """ + + url_mapping_without_user_anno = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=False + ) + + url_mapping_with_user_anno = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + # Have both be listed as unknown + + for url_mapping in [url_mapping_with_user_anno, url_mapping_without_user_anno]: + await db_data_creator.agency_auto_suggestions( + url_id=url_mapping.url_id, + count=3, + suggestion_type=SuggestionType.UNKNOWN + ) + + result = await db_data_creator.adb_client.get_next_url_for_final_review() + + assert result.id == url_mapping_with_user_anno.url_id + + + +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_favor_more_annotations(db_data_creator: DBDataCreator): + """ + Test in the case of two URLs with the same number of components annotated, favoring the one with more 
total annotations
+    """
+    url_mapping_lower_count = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=1,
+        include_user_annotations=True
+    )
+
+    url_mapping_higher_count = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=3,
+        include_user_annotations=True
+    )
+
+    for url_mapping in [url_mapping_lower_count, url_mapping_higher_count]:
+        await db_data_creator.agency_confirmed_suggestion(
+            url_id=url_mapping.url_id
+        )
+
+    result = await db_data_creator.adb_client.get_next_url_for_final_review()
+
+    assert result.id == url_mapping_higher_count.url_id
+
+    assert result.annotations.agency.confirmed is not None
+
+    # TODO: Check that the confirmed agency is shown for the result
+
+
+@pytest.mark.asyncio
+async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBDataCreator):
+    """
+    Test in the case of one URL with no annotations.
+    Should be returned if it is the only one available.
+    """
+    batch_id = db_data_creator.batch()
+    url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0]
+
+    result = await db_data_creator.adb_client.get_next_url_for_final_review()
+
+    assert result.id == url_mapping.url_id
+
+    annotations = result.annotations
+
+    agency = annotations.agency
+    assert agency.confirmed is None
+    assert agency.auto.unknown is True
+    assert agency.auto.suggestions == []
+
+    record_type = annotations.record_type
+    assert record_type.auto is None
+    assert record_type.users == {}
+
+    relevant = annotations.relevant
+    assert relevant.auto is None
+    assert relevant.users.relevant == 0
+    assert relevant.users.not_relevant == 0
+
+
+async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator: DBDataCreator):
+    """
+    Test in the case of one URL that is confirmed
+    Should not be returned.
+    """
+    batch_id = db_data_creator.batch()
\ No newline at end of file
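A minimal usage sketch for the /review/next-source endpoint added in this patch. It assumes the API is running locally on port 8000 (per the uvicorn.run call added to api/main.py) and that requests authenticate with a bearer JWT signed with DS_APP_SECRET_KEY; the host, token value, and bearer scheme are illustrative assumptions, not part of the patch series.

import requests

# Assumption: a JWT issued by the Data Sources App and signed with the shared
# DS_APP_SECRET_KEY, which SecurityManager.validate_token decodes.
token = "<jwt-from-data-sources-app>"

# The route comes from review_router: prefix "/review" plus "/next-source".
response = requests.get(
    "http://localhost:8000/review/next-source",
    headers={"Authorization": f"Bearer {token}"},
)
response.raise_for_status()

# The body mirrors GetNextURLForFinalReviewResponse: the URL's id and address,
# its HTML info, and the user/auto annotation summaries.
data = response.json()
print(data["id"], data["url"])
print(data["annotations"]["relevant"])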
Must match the secret token `JWT_SECRET_KEY` that is used in the Data Sources App for encoding. | `abc123` | |`DEV`| Set to any value to run the application in development mode. | `true` | |`DEEPSEEK_API_KEY`| The API key required for accessing the DeepSeek API. | `abc123` | -|`OPENAI_API_KEY`| The API key required for accessing the OpenAI API.| `abc123` | -|`PDAP_EMAIL`| An email address for accessing the PDAP API.| `abc123@test.com` | -|`PDAP_PASSWORD`| A password for accessing the PDAP API.| `abc123` | -|`PDAP_API_KEY`| An API key for accessing the PDAP API.| `abc123` | +|`OPENAI_API_KEY`| The API key required for accessing the OpenAI API. | `abc123` | +|`PDAP_EMAIL`| An email address for accessing the PDAP API. | `abc123@test.com` | +|`PDAP_PASSWORD`| A password for accessing the PDAP API. | `abc123` | +|`PDAP_API_KEY`| An API key for accessing the PDAP API. | `abc123` | diff --git a/api/main.py b/api/main.py index 0a9c0249..34cebdd1 100644 --- a/api/main.py +++ b/api/main.py @@ -1,5 +1,6 @@ from contextlib import asynccontextmanager +import uvicorn from fastapi import FastAPI from api.routes.annotate import annotate_router @@ -84,3 +85,5 @@ async def setup_database(db_client): app.include_router(router) +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/api/routes/review.py b/api/routes/review.py new file mode 100644 index 00000000..f1b7210a --- /dev/null +++ b/api/routes/review.py @@ -0,0 +1,19 @@ +from fastapi import APIRouter, Depends + +from api.dependencies import get_async_core +from core.AsyncCore import AsyncCore +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse +from security_manager.SecurityManager import AccessInfo, get_access_info + +review_router = APIRouter( + prefix="/review", + tags=["Review"], + responses={404: {"description": "Not found"}}, +) + +@review_router.get("/next-source") +async def get_next_source( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), +) -> GetNextURLForFinalReviewResponse: + return await core.get_next_source_for_review() \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 2f657c54..62a44fbf 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,11 +1,12 @@ from functools import wraps from typing import Optional -from sqlalchemy import select, exists, func +from sqlalchemy import select, exists, func, distinct, case, desc, asc from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker -from sqlalchemy.orm import selectinload, aliased +from sqlalchemy.orm import selectinload, aliased, joinedload from collector_db.ConfigManager import ConfigManager +from collector_db.DTOConverter import DTOConverter from collector_db.DTOs.MetadataAnnotationInfo import MetadataAnnotationInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo @@ -23,6 +24,8 @@ from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo, \ + FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, 
FinalReviewAnnotationAgencyInfo from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo @@ -47,10 +50,12 @@ def __init__(self, db_url: str = get_postgres_connection_string(is_async=True)): self.statement_composer = StatementComposer() @staticmethod - def _add_models(session: AsyncSession, model_class, models): - for model in models: - instance = model_class(**model.model_dump()) - session.add(instance) + async def _add_models(session: AsyncSession, model_class, models) -> list[int]: + instances = [model_class(**model.model_dump()) for model in models] + session.add_all(instances) + await session.flush() + return [instance.id for instance in instances] + @staticmethod @@ -86,12 +91,13 @@ async def get_url_metadata_by_status( return [URLMetadataInfo(**url_metadata.__dict__) for url_metadata in model_result] @session_manager - async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo): - self._add_models(session, URLMetadata, [url_metadata_info]) + async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo) -> int: + result = await self._add_models(session, URLMetadata, [url_metadata_info]) + return result[0] @session_manager - async def add_url_metadatas(self, session: AsyncSession, url_metadata_infos: list[URLMetadataInfo]): - self._add_models(session, URLMetadata, url_metadata_infos) + async def add_url_metadatas(self, session: AsyncSession, url_metadata_infos: list[URLMetadataInfo]) -> list[int]: + return await self._add_models(session, URLMetadata, url_metadata_infos) @session_manager async def add_url_error_infos(self, session: AsyncSession, url_error_infos: list[URLErrorPydanticInfo]): @@ -125,7 +131,7 @@ async def get_urls_with_errors(self, session: AsyncSession) -> list[URLErrorPyda @session_manager async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]): - self._add_models(session, URLHTMLContent, html_content_infos) + await self._add_models(session, URLHTMLContent, html_content_infos) @session_manager async def has_pending_urls_without_html_data(self, session: AsyncSession) -> bool: @@ -312,7 +318,7 @@ async def get_next_url_for_annotation( return annotation_info @session_manager - async def add_relevance_annotation( + async def add_metadata_annotation( self, session: AsyncSession, user_id: int, @@ -780,3 +786,171 @@ async def add_agency_manual_suggestion( is_new=is_new ) session.add(url_agency_suggestion) + + @session_manager + async def get_next_url_for_final_review(self, session: AsyncSession) -> GetNextURLForFinalReviewResponse: + + # Subqueries for ORDER clause + + # Subqueries for Counting distinct annotations + # Count distinct auto annotations for metadata + distinct_auto_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(distinct(URLMetadata.attribute)).label("auto_count") + ). + group_by(URLMetadata.url_id).subquery() + ) + # Count distinct user annotations for metadata + distinct_user_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(distinct(URLMetadata.attribute)).label("user_count") + ).join(MetadataAnnotation). + where(MetadataAnnotation.user_id != None). 
+ group_by(URLMetadata.url_id).subquery() + ) + + + # Count whether agency auto annotations exist + # (Note: Can be either confirmed or auto suggestion) + agency_annotations_exist_subquery = ( + select( + URL.id, + case( + ( + exists().where(URL.id == ConfirmedUrlAgency.url_id), 1 + ), + ( + exists().where(URL.id == AutomatedUrlAgencySuggestion.url_id), 1 + ), + else_=0 + ).label("agency_annotations_exist") + ).subquery() + ) + + # Count whether agency user annotations exist + agency_user_annotations_exist_subquery = ( + select( + URL.id, + case( + ( + exists().where(URL.id == UserUrlAgencySuggestion.url_id), 1 + ), + else_=0 + ).label("agency_user_annotations_exist") + ).subquery() + ) + + # Subqueries for counting *all* annotations + + # Count all auto annotations for metadata + all_auto_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(URLMetadata.attribute).label("auto_count") + ).group_by(URLMetadata.url_id).subquery() + ) + # Count all user annotations for metadata + all_user_metadata_subquery = ( + select( + URLMetadata.url_id, + func.count(URLMetadata.attribute).label("user_count") + ).join(MetadataAnnotation). + where(MetadataAnnotation.user_id != None). + group_by(URLMetadata.url_id).subquery() + ) + + # Count all user agency annotations + all_user_agency_annotations_subquery = ( + select( + UserUrlAgencySuggestion.url_id, + func.count(UserUrlAgencySuggestion.agency_id).label("user_count") + ).group_by(UserUrlAgencySuggestion.url_id).subquery() + ) + + + + # Basic URL query + url_query = ( + select( + URL, + ( + func.coalesce(distinct_auto_metadata_subquery.c.auto_count, 0) + + func.coalesce(distinct_user_metadata_subquery.c.user_count, 0) + + func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + + func.coalesce(agency_user_annotations_exist_subquery.c.agency_user_annotations_exist, 0) + ).label("total_distinct_annotation_count"), + ( + func.coalesce(all_auto_metadata_subquery.c.auto_count, 0) + + func.coalesce(all_user_metadata_subquery.c.user_count, 0) + + func.coalesce(all_user_agency_annotations_subquery.c.user_count, 0) + + func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + ).label("total_overall_annotation_count") + ).outerjoin( + distinct_auto_metadata_subquery, URL.id == distinct_auto_metadata_subquery.c.url_id + ).outerjoin( + distinct_user_metadata_subquery, URL.id == distinct_user_metadata_subquery.c.url_id + ).outerjoin( + agency_annotations_exist_subquery, URL.id == agency_annotations_exist_subquery.c.id + ).outerjoin( + agency_user_annotations_exist_subquery, URL.id == agency_user_annotations_exist_subquery.c.id + ).outerjoin( + all_auto_metadata_subquery, URL.id == all_auto_metadata_subquery.c.url_id + ).outerjoin( + all_user_metadata_subquery, URL.id == all_user_metadata_subquery.c.url_id + ).outerjoin( + all_user_agency_annotations_subquery, URL.id == all_user_agency_annotations_subquery.c.url_id + ) + ) + options = [ + joinedload(URL.html_content), + joinedload(URL.url_metadata).joinedload(URLMetadata.annotations), + joinedload(URL.automated_agency_suggestions).joinedload(AutomatedUrlAgencySuggestion.agency), + joinedload(URL.user_agency_suggestions).joinedload(UserUrlAgencySuggestion.agency), + joinedload(URL.confirmed_agencies).joinedload(ConfirmedUrlAgency.agency), + ] + + # Apply options + url_query = url_query.options(*options) + + # Apply order clause + url_query = url_query.order_by( + desc("total_distinct_annotation_count"), + desc("total_overall_annotation_count"), + ) + + + # 
Apply limit + url_query = url_query.limit(1) + + # Execute query + raw_result = await session.execute(url_query) + full_result = raw_result.unique().all()[0] + result: URL = full_result[0] + + # Convert html content to response format + html_content = result.html_content + html_content_infos = [URLHTMLContentInfo(**html_info.__dict__) for html_info in html_content] + + automated_agency_suggestions = result.automated_agency_suggestions + user_agency_suggestions = result.user_agency_suggestions + confirmed_agencies = result.confirmed_agencies + url_metadatas = result.url_metadata + + # Return + return GetNextURLForFinalReviewResponse( + id=result.id, + url=result.url, + html_info=convert_to_response_html_info(html_content_infos), + annotations=FinalReviewAnnotationInfo( + relevant=DTOConverter.final_review_annotation_relevant_info(url_metadatas), + record_type=DTOConverter.final_review_annotation_record_type_info(url_metadatas), + agency=DTOConverter.final_review_annotation_agency_info( + automated_agency_suggestions=automated_agency_suggestions, + confirmed_agencies=confirmed_agencies, + user_agency_suggestions=user_agency_suggestions + ) + ) + ) + diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py new file mode 100644 index 00000000..63c88d92 --- /dev/null +++ b/collector_db/DTOConverter.py @@ -0,0 +1,198 @@ +from collector_db.enums import ValidationStatus, ValidationSource, URLMetadataAttributeType +from collector_db.models import URLMetadata, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, \ + MetadataAnnotation +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo +from core.DTOs.GetNextURLForFinalReviewResponse import FinalReviewAnnotationRelevantInfo, \ + FinalReviewAnnotationRelevantUsersInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ + FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyUserInfo +from core.enums import RecordType, SuggestionType + + +def get_url_metadata( + url_metadatas: list[URLMetadata], + validation_status: ValidationStatus, + validation_source: ValidationSource, + attribute: URLMetadataAttributeType +): + for url_metadata in url_metadatas: + if url_metadata.validation_status != validation_status.value: + continue + if url_metadata.validation_source != validation_source.value: + continue + if url_metadata.attribute != attribute.value: + continue + return url_metadata + + + +class DTOConverter: + + """ + Converts SQLAlchemy objects to DTOs + """ + + @staticmethod + def final_review_annotation_relevant_info( + url_metadatas: list[URLMetadata] + ) -> FinalReviewAnnotationRelevantInfo: + relevant_metadata = get_url_metadata( + url_metadatas=url_metadatas, + validation_status=ValidationStatus.PENDING_VALIDATION, + validation_source=ValidationSource.MACHINE_LEARNING, + attribute=URLMetadataAttributeType.RELEVANT + ) + auto_value = relevant_metadata.value if relevant_metadata else None + if auto_value is not None: + auto_value = (auto_value == "True") + + + annotations: list[MetadataAnnotation] = relevant_metadata.annotations if relevant_metadata else [] + relevant_count = 0 + not_relevant_count = 0 + for annotation in annotations: + if annotation.value == "True": + relevant_count += 1 + else: + not_relevant_count += 1 + return FinalReviewAnnotationRelevantInfo( + auto=auto_value, + users=FinalReviewAnnotationRelevantUsersInfo( + relevant=relevant_count, + not_relevant=not_relevant_count + ) + ) + + @staticmethod + def 
final_review_annotation_record_type_info( + url_metadata: list[URLMetadata] + ): + record_type_metadata = get_url_metadata( + url_metadatas=url_metadata, + validation_status=ValidationStatus.PENDING_VALIDATION, + validation_source=ValidationSource.MACHINE_LEARNING, + attribute=URLMetadataAttributeType.RECORD_TYPE + ) + user_count = {} + if record_type_metadata is None: + auto_value = None + annotations = [] + else: + auto_value = RecordType(record_type_metadata.value) + annotations = record_type_metadata.annotations + for annotation in annotations: + value = RecordType(annotation.value) + if value not in user_count: + user_count[value] = 0 + user_count[value] += 1 + # Sort users by count, descending + user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) + + return FinalReviewAnnotationRecordTypeInfo( + auto=auto_value, + users=user_count + ) + + @staticmethod + def final_review_annotation_agency_auto_info( + automated_agency_suggestions: list[AutomatedUrlAgencySuggestion] + ) -> FinalReviewAnnotationAgencyAutoInfo: + + if len(automated_agency_suggestions) == 0: + return FinalReviewAnnotationAgencyAutoInfo( + unknown=True, + suggestions=[] + ) + + + if len(automated_agency_suggestions) == 1: + suggestion = automated_agency_suggestions[0] + unknown = suggestion.is_unknown + else: + unknown = False + + if unknown: + return FinalReviewAnnotationAgencyAutoInfo( + unknown=True, + suggestions=[ + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.UNKNOWN, + ) + ] + ) + + return FinalReviewAnnotationAgencyAutoInfo( + unknown=unknown, + suggestions=[ + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=suggestion.agency_id, + agency_name=suggestion.agency.name, + state=suggestion.agency.state, + county=suggestion.agency.county, + locality=suggestion.agency.locality + ) for suggestion in automated_agency_suggestions + ] + ) + + @staticmethod + def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_url_agency_suggestions: list[UserUrlAgencySuggestion] + ) -> dict[int, FinalReviewAnnotationAgencyUserInfo]: + d = {} + for suggestion in user_url_agency_suggestions: + agency = suggestion.agency + agency_id = agency.agency_id + if agency.agency_id not in d: + d[agency_id] = FinalReviewAnnotationAgencyUserInfo( + suggestion_type=SuggestionType.MANUAL_SUGGESTION, + agency_name=agency.name, + pdap_agency_id=agency_id, + state=agency.state, + county=agency.county, + locality=agency.locality, + count=1 + ) + else: + d[agency_id].count += 1 + + # Return sorted + return dict(sorted(d.items(), key=lambda x: x[1].count, reverse=True)) + + + @staticmethod + def final_review_annotation_agency_info( + automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], + confirmed_agencies: list[ConfirmedUrlAgency], + user_agency_suggestions: list[UserUrlAgencySuggestion] + ): + if len(confirmed_agencies) == 1: + confirmed_agency = confirmed_agencies[0] + confirmed_agency_info = GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=confirmed_agency.agency_id, + agency_name=confirmed_agency.agency.name, + state=confirmed_agency.agency.state, + county=confirmed_agency.agency.county, + locality=confirmed_agency.agency.locality + ) + return FinalReviewAnnotationAgencyInfo( + confirmed=confirmed_agency_info, + users=None, + auto=None + ) + + + agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( + automated_agency_suggestions + ) + + agency_user_info = 
DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestions + ) + + return FinalReviewAnnotationAgencyInfo( + confirmed=None, + users=agency_user_info, + auto=agency_auto_info + ) + diff --git a/collector_db/models.py b/collector_db/models.py index ee43f35b..f462198c 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -111,7 +111,7 @@ class URLMetadata(Base): name="uq_url_id_attribute"), ) - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True, autoincrement=True) url_id = Column(Integer, ForeignKey('urls.id', name='url_metadata_url_id_fkey'), nullable=False) attribute = Column( PGEnum('Record Type', 'Agency', 'Relevant', name='url_attribute'), @@ -143,7 +143,7 @@ class MetadataAnnotation(Base): name="metadata_annotations_uq_user_id_metadata_id"), ) - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True, autoincrement=True) user_id = Column(Integer, nullable=False) metadata_id = Column(Integer, ForeignKey('url_metadata.id'), nullable=False) value = Column(Text, nullable=False) @@ -193,7 +193,7 @@ class URLHTMLContent(Base): name="uq_url_id_content_type"), ) - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True, autoincrement=True) url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) content_type = Column( PGEnum('Title', 'Description', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'Div', name='url_html_content_type'), diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 808821f7..d2085540 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -180,7 +180,7 @@ async def submit_url_annotation( annotation: str, metadata_type: URLMetadataAttributeType ) -> GetNextURLForAnnotationResponse: - await self.adb_client.add_relevance_annotation( + await self.adb_client.add_metadata_annotation( user_id=user_id, metadata_id=metadata_id, annotation=annotation) @@ -220,3 +220,6 @@ async def submit_url_agency_annotation( agency_id=agency_suggestion_id, is_new=agency_post_info.is_new, ) + + async def get_next_source_for_review(self): + return await self.adb_client.get_next_url_for_final_review() diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py new file mode 100644 index 00000000..8a7077a1 --- /dev/null +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -0,0 +1,64 @@ +from typing import Optional + +from pydantic import BaseModel, Field + +from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo +from core.enums import RecordType +from html_tag_collector.DataClassTags import ResponseHTMLInfo + +# Todo: Add descriptions + +class FinalReviewAnnotationRelevantUsersInfo(BaseModel): + relevant: int = Field(title="Number of users who marked the URL as relevant") + not_relevant: int = Field(title="Number of users who marked the URL as not relevant") + +class FinalReviewAnnotationRelevantInfo(BaseModel): + auto: Optional[bool] = Field(title="Whether the auto-labeler has marked the URL as relevant") + users: FinalReviewAnnotationRelevantUsersInfo = Field( + title="How users identified the relevancy of the source", + ) + +class FinalReviewAnnotationRecordTypeInfo(BaseModel): + auto: Optional[RecordType] = Field(title="The record type suggested by the auto-labeler") + users: dict[RecordType, int] = Field( + title="A dictionary, sorted by size and omitting zero values, of all record types suggested by users", 
+ ) + +class FinalReviewAnnotationAgencyUserInfo(GetNextURLForAgencyAgencyInfo): + count: int = Field(title="Number of times suggested by users") + +class FinalReviewAnnotationAgencyAutoInfo(BaseModel): + unknown: bool = Field(title="Whether the auto-labeler suggested the URL as unknown") + suggestions: Optional[list[GetNextURLForAgencyAgencyInfo]] = Field( + title="A list of agencies, if any, suggested by the auto-labeler", + ) + +class FinalReviewAnnotationAgencyInfo(BaseModel): + confirmed: Optional[GetNextURLForAgencyAgencyInfo] = Field( + title="The confirmed agency for the URL", + ) + auto: Optional[FinalReviewAnnotationAgencyAutoInfo] = Field( + title="A single agency or a list of agencies suggested by the auto-labeler",) + users: Optional[dict[int, FinalReviewAnnotationAgencyUserInfo]] = Field( + title="A list, sorted by size, of all agencies suggested by users", + ) + +class FinalReviewAnnotationInfo(BaseModel): + relevant: FinalReviewAnnotationRelevantInfo = Field( + title="User and auto annotations for relevancy", + ) + record_type: FinalReviewAnnotationRecordTypeInfo = Field( + title="User and auto annotations for record type", + ) + agency: FinalReviewAnnotationAgencyInfo = Field( + title="User and auto annotations for agency", + ) + +class GetNextURLForFinalReviewResponse(BaseModel): + id: int = Field(title="The id of the URL") + url: str = Field(title="The URL") + html_info: ResponseHTMLInfo = Field(title="The HTML content of the URL") + annotations: FinalReviewAnnotationInfo = Field( + title="The annotations for the URL, from both users and the auto-labeler", + ) \ No newline at end of file diff --git a/security_manager/SecurityManager.py b/security_manager/SecurityManager.py index 18bc6a26..8d80f46c 100644 --- a/security_manager/SecurityManager.py +++ b/security_manager/SecurityManager.py @@ -39,6 +39,7 @@ def __init__( def validate_token(self, token: str) -> AccessInfo: try: payload = jwt.decode(token, self.secret_key, algorithms=[ALGORITHM]) + print(payload) return self.payload_to_access_info(payload) except InvalidTokenError as e: raise HTTPException( diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index d550e801..86efe510 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -186,9 +186,10 @@ async def metadata( value: str = "False", validation_status: ValidationStatus = ValidationStatus.PENDING_VALIDATION, validation_source: ValidationSource = ValidationSource.MACHINE_LEARNING - ): + ) -> list[int]: + metadata_ids = [] for url_id in url_ids: - await self.adb_client.add_url_metadata( + metadata_id = await self.adb_client.add_url_metadata( URLMetadataInfo( url_id=url_id, attribute=attribute, @@ -197,6 +198,8 @@ async def metadata( validation_source=validation_source, ) ) + metadata_ids.append(metadata_id) + return metadata_ids async def error_info( self, @@ -215,3 +218,71 @@ async def error_info( error_infos.append(url_error_info) await self.adb_client.add_url_error_infos(error_infos) + async def user_annotation( + self, + metadata_id: int, + user_id: Optional[int] = None, + annotation: str = "test annotation" + ): + if user_id is None: + user_id = randint(1, 99999999) + await self.adb_client.add_metadata_annotation( + user_id=user_id, + metadata_id=metadata_id, + annotation=annotation + ) + + async def agency_auto_suggestions( + self, + url_id: int, + count: int, + suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION + ): + if suggestion_type == SuggestionType.UNKNOWN: + count = 1 # Can only be one 
auto suggestion if unknown + + await self.adb_client.add_agency_auto_suggestions( + suggestions=[ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=suggestion_type, + pdap_agency_id=None if suggestion_type == SuggestionType.UNKNOWN else await self.agency(), + state="Test State", + county="Test County", + locality="Test Locality" + ) for _ in range(count) + ] + ) + + async def agency_confirmed_suggestion( + self, + url_id: int + ): + + await self.adb_client.add_confirmed_agency_url_links( + suggestions=[ + URLAgencySuggestionInfo( + url_id=url_id, + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=await self.agency() + ) + ] + ) + + async def agency_user_suggestions( + self, + url_id: int, + user_id: Optional[int] = None, + agency_id: Optional[int] = None + ): + if user_id is None: + user_id = randint(1, 99999999) + + if agency_id is None: + agency_id = await self.agency() + await self.adb_client.add_agency_manual_suggestion( + agency_id=agency_id, + url_id=url_id, + user_id=user_id, + is_new=False + ) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index fa3b7110..9444bc79 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -10,7 +10,7 @@ from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource from collector_manager.enums import URLStatus -from core.enums import BatchStatus +from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator @@ -194,3 +194,216 @@ async def test_get_urls_with_metadata(db_data_creator: DBDataCreator): ) assert len(results) == 1 +async def setup_for_get_next_url_for_final_review( + db_data_creator: DBDataCreator, + annotation_count: int, + include_user_annotations: bool = True +): + batch_id = db_data_creator.batch() + url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] + await db_data_creator.html_data([url_mapping.url_id]) + + async def add_metadata_annotation(count: int, value: str, metadata_id: int): + for i in range(count): + await db_data_creator.user_annotation( + metadata_id=metadata_id, + annotation=value + ) + + async def add_user_suggestion(count: int): + agency_id = await db_data_creator.agency() + for i in range(count): + await db_data_creator.agency_user_suggestions( + url_id=url_mapping.url_id, + agency_id=agency_id + ) + + relevant_metadata_ids = await db_data_creator.metadata( + url_ids=[url_mapping.url_id], + attribute=URLMetadataAttributeType.RELEVANT, + value="True", + validation_source=ValidationSource.MACHINE_LEARNING, + validation_status=ValidationStatus.PENDING_VALIDATION + ) + relevant_metadata_id = relevant_metadata_ids[0] + record_type_metadata_ids = await db_data_creator.metadata( + url_ids=[url_mapping.url_id], + attribute=URLMetadataAttributeType.RECORD_TYPE, + value=RecordType.ARREST_RECORDS.value, + validation_source=ValidationSource.MACHINE_LEARNING, + validation_status=ValidationStatus.PENDING_VALIDATION + ) + record_type_metadata_id = record_type_metadata_ids[0] + + if include_user_annotations: + await add_metadata_annotation(annotation_count, "True", relevant_metadata_id) + await add_metadata_annotation(1, "False", relevant_metadata_id) + await add_metadata_annotation(3, RecordType.ARREST_RECORDS.value, record_type_metadata_id) + await 
add_metadata_annotation(2, RecordType.DISPATCH_RECORDINGS.value, record_type_metadata_id) + await add_metadata_annotation(1, RecordType.ACCIDENT_REPORTS.value, record_type_metadata_id) + + if include_user_annotations: + # Add user suggestions for agencies, one suggested by 3 users, another by 2, another by 1 + for i in range(annotation_count): + await add_user_suggestion(i + 1) + + + return url_mapping + + +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreator): + """ + Test that an annotated URL is returned + """ + + url_mapping = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + await db_data_creator.agency_auto_suggestions( + url_id=url_mapping.url_id, + count=3 + ) + + + result = await db_data_creator.adb_client.get_next_url_for_final_review() + + assert result.url == url_mapping.url + html_info = result.html_info + assert html_info.description == "test description" + assert html_info.title == "test html content" + + annotation_info = result.annotations + relevant_info = annotation_info.relevant + assert relevant_info.auto == True + assert relevant_info.users.relevant == 3 + assert relevant_info.users.not_relevant == 1 + + record_type_info = annotation_info.record_type + assert record_type_info.auto == RecordType.ARREST_RECORDS + user_d = record_type_info.users + assert user_d[RecordType.ARREST_RECORDS] == 3 + assert user_d[RecordType.DISPATCH_RECORDINGS] == 2 + assert user_d[RecordType.ACCIDENT_REPORTS] == 1 + assert list(user_d.keys()) == [RecordType.ARREST_RECORDS, RecordType.DISPATCH_RECORDINGS, RecordType.ACCIDENT_REPORTS] + + + agency_info = annotation_info.agency + auto_agency_suggestions = agency_info.auto + assert auto_agency_suggestions.unknown == False + assert len(auto_agency_suggestions.suggestions) == 3 + + # Check user agency suggestions exist and in descending order of count + user_agency_suggestions = agency_info.users + user_agency_suggestions_as_list = list(user_agency_suggestions.values()) + assert len(user_agency_suggestions_as_list) == 3 + for i in range(3): + assert user_agency_suggestions_as_list[i].count == 3 - i + + + +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_favor_more_components(db_data_creator: DBDataCreator): + """ + Test in the case of two URLs, favoring the one with more annotations for more components + i.e., if one has annotations for record type and agency id, that should be favored over one with just record type + """ + + url_mapping_without_user_anno = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=False + ) + + url_mapping_with_user_anno = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + # Have both be listed as unknown + + for url_mapping in [url_mapping_with_user_anno, url_mapping_without_user_anno]: + await db_data_creator.agency_auto_suggestions( + url_id=url_mapping.url_id, + count=3, + suggestion_type=SuggestionType.UNKNOWN + ) + + result = await db_data_creator.adb_client.get_next_url_for_final_review() + + assert result.id == url_mapping_with_user_anno.url_id + + + +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_favor_more_annotations(db_data_creator: DBDataCreator): + """ + Test in the case of two URLs with the same number of components annotated, favoring the one with more 
total annotations
+    """
+    url_mapping_lower_count = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=1,
+        include_user_annotations=True
+    )
+
+    url_mapping_higher_count = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=3,
+        include_user_annotations=True
+    )
+
+    for url_mapping in [url_mapping_lower_count, url_mapping_higher_count]:
+        await db_data_creator.agency_confirmed_suggestion(
+            url_id=url_mapping.url_id
+        )
+
+    result = await db_data_creator.adb_client.get_next_url_for_final_review()
+
+    assert result.id == url_mapping_higher_count.url_id
+
+    assert result.annotations.agency.confirmed is not None
+
+    # TODO: Check that the confirmed agency is shown for the result
+
+
+
+@pytest.mark.asyncio
+async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBDataCreator):
+    """
+    Test in the case of one URL with no annotations.
+    Should be returned if it is the only one available.
+    """
+    batch_id = db_data_creator.batch()
+    url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0]
+
+    result = await db_data_creator.adb_client.get_next_url_for_final_review()
+
+    assert result.id == url_mapping.url_id
+
+    annotations = result.annotations
+
+    agency = annotations.agency
+    assert agency.confirmed is None
+    assert agency.auto.unknown is True
+    assert agency.auto.suggestions == []
+
+    record_type = annotations.record_type
+    assert record_type.auto is None
+    assert record_type.users == {}
+
+    relevant = annotations.relevant
+    assert relevant.auto is None
+    assert relevant.users.relevant == 0
+    assert relevant.users.not_relevant == 0
+
+
+async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator: DBDataCreator):
+    """
+    Test in the case of one URL that is confirmed
+    Should not be returned.
+ """ + batch_id = db_data_creator.batch() \ No newline at end of file From 96a98acbf103cd4be8cf75270632d0c1a5c449a4 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 21 Feb 2025 11:21:20 -0500 Subject: [PATCH 053/182] feat(api): add review get next source endpoint --- api/main.py | 5 +- collector_db/AsyncDatabaseClient.py | 15 +++- tests/helpers/DBDataCreator.py | 10 ++- tests/helpers/complex_test_data_functions.py | 60 +++++++++++++++ .../api/helpers/RequestValidator.py | 10 ++- .../integration/api/test_review.py | 55 ++++++++++++++ .../collector_db/test_db_client.py | 74 ++++--------------- 7 files changed, 162 insertions(+), 67 deletions(-) create mode 100644 tests/helpers/complex_test_data_functions.py create mode 100644 tests/test_automated/integration/api/test_review.py diff --git a/api/main.py b/api/main.py index 34cebdd1..8feaa165 100644 --- a/api/main.py +++ b/api/main.py @@ -6,6 +6,7 @@ from api.routes.annotate import annotate_router from api.routes.batch import batch_router from api.routes.collector import collector_router +from api.routes.review import review_router from api.routes.root import root_router from api.routes.task import task_router from api.routes.url import url_router @@ -79,8 +80,10 @@ async def setup_database(db_client): batch_router, annotate_router, url_router, - task_router + task_router, + review_router ] + for router in routers: app.include_router(router) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 62a44fbf..f6548dfa 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -788,7 +788,10 @@ async def add_agency_manual_suggestion( session.add(url_agency_suggestion) @session_manager - async def get_next_url_for_final_review(self, session: AsyncSession) -> GetNextURLForFinalReviewResponse: + async def get_next_url_for_final_review( + self, + session: AsyncSession + ) -> Optional[GetNextURLForFinalReviewResponse]: # Subqueries for ORDER clause @@ -901,6 +904,8 @@ async def get_next_url_for_final_review(self, session: AsyncSession) -> GetNextU all_user_metadata_subquery, URL.id == all_user_metadata_subquery.c.url_id ).outerjoin( all_user_agency_annotations_subquery, URL.id == all_user_agency_annotations_subquery.c.url_id + ).where( + URL.outcome == URLStatus.PENDING.value ) ) options = [ @@ -926,8 +931,12 @@ async def get_next_url_for_final_review(self, session: AsyncSession) -> GetNextU # Execute query raw_result = await session.execute(url_query) - full_result = raw_result.unique().all()[0] - result: URL = full_result[0] + + full_result = raw_result.unique().all() + + if len(full_result) == 0: + return None + result: URL = full_result[0][0] # Convert html content to response format html_content = result.html_content diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 86efe510..16f73602 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -13,7 +13,7 @@ from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.DatabaseClient import DatabaseClient from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType -from collector_manager.enums import CollectorType +from collector_manager.enums import CollectorType, URLStatus from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.enums import BatchStatus, SuggestionType from tests.helpers.simple_test_data_functions import generate_test_urls @@ -130,13 +130,19 @@ async def 
manual_suggestion(self, user_id: int, url_id: int, is_new: bool = Fals ) - def urls(self, batch_id: int, url_count: int) -> InsertURLsInfo: + def urls( + self, + batch_id: int, + url_count: int, + outcome: URLStatus = URLStatus.PENDING + ) -> InsertURLsInfo: raw_urls = generate_test_urls(url_count) url_infos: List[URLInfo] = [] for url in raw_urls: url_infos.append( URLInfo( url=url, + outcome=outcome ) ) diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py new file mode 100644 index 00000000..57f4c412 --- /dev/null +++ b/tests/helpers/complex_test_data_functions.py @@ -0,0 +1,60 @@ +from collector_db.enums import URLMetadataAttributeType, ValidationSource, ValidationStatus +from core.enums import RecordType +from tests.helpers.DBDataCreator import DBDataCreator + + +async def setup_for_get_next_url_for_final_review( + db_data_creator: DBDataCreator, + annotation_count: int, + include_user_annotations: bool = True +): + batch_id = db_data_creator.batch() + url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] + await db_data_creator.html_data([url_mapping.url_id]) + + async def add_metadata_annotation(count: int, value: str, metadata_id: int): + for i in range(count): + await db_data_creator.user_annotation( + metadata_id=metadata_id, + annotation=value + ) + + async def add_user_suggestion(count: int): + agency_id = await db_data_creator.agency() + for i in range(count): + await db_data_creator.agency_user_suggestions( + url_id=url_mapping.url_id, + agency_id=agency_id + ) + + relevant_metadata_ids = await db_data_creator.metadata( + url_ids=[url_mapping.url_id], + attribute=URLMetadataAttributeType.RELEVANT, + value="True", + validation_source=ValidationSource.MACHINE_LEARNING, + validation_status=ValidationStatus.PENDING_VALIDATION + ) + relevant_metadata_id = relevant_metadata_ids[0] + record_type_metadata_ids = await db_data_creator.metadata( + url_ids=[url_mapping.url_id], + attribute=URLMetadataAttributeType.RECORD_TYPE, + value=RecordType.ARREST_RECORDS.value, + validation_source=ValidationSource.MACHINE_LEARNING, + validation_status=ValidationStatus.PENDING_VALIDATION + ) + record_type_metadata_id = record_type_metadata_ids[0] + + if include_user_annotations: + await add_metadata_annotation(annotation_count, "True", relevant_metadata_id) + await add_metadata_annotation(1, "False", relevant_metadata_id) + await add_metadata_annotation(3, RecordType.ARREST_RECORDS.value, record_type_metadata_id) + await add_metadata_annotation(2, RecordType.DISPATCH_RECORDINGS.value, record_type_metadata_id) + await add_metadata_annotation(1, RecordType.ACCIDENT_REPORTS.value, record_type_metadata_id) + + if include_user_annotations: + # Add user suggestions for agencies, one suggested by 3 users, another by 2, another by 1 + for i in range(annotation_count): + await add_user_suggestion(i + 1) + + + return url_mapping diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 38b7cafd..d25ca424 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -15,6 +15,7 @@ from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse +from core.DTOs.GetNextURLForFinalReviewResponse import 
GetNextURLForFinalReviewResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo @@ -260,4 +261,11 @@ def get_tasks( url=f"/task", params=params ) - return GetTasksResponse(**data) \ No newline at end of file + return GetTasksResponse(**data) + + async def review_next_source(self) -> GetNextURLForFinalReviewResponse: + data = self.get( + url=f"/review/next-source" + ) + return GetNextURLForFinalReviewResponse(**data) + diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py new file mode 100644 index 00000000..a69f474a --- /dev/null +++ b/tests/test_automated/integration/api/test_review.py @@ -0,0 +1,55 @@ +import pytest + +from core.enums import RecordType +from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_review_next_source(api_test_helper): + ath = api_test_helper + + url_mapping = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + await ath.db_data_creator.agency_auto_suggestions( + url_id=url_mapping.url_id, + count=3 + ) + + result = await ath.request_validator.review_next_source() + + assert result.url == url_mapping.url + html_info = result.html_info + assert html_info.description == "test description" + assert html_info.title == "test html content" + + annotation_info = result.annotations + relevant_info = annotation_info.relevant + assert relevant_info.auto == True + assert relevant_info.users.relevant == 3 + assert relevant_info.users.not_relevant == 1 + + record_type_info = annotation_info.record_type + assert record_type_info.auto == RecordType.ARREST_RECORDS + user_d = record_type_info.users + assert user_d[RecordType.ARREST_RECORDS] == 3 + assert user_d[RecordType.DISPATCH_RECORDINGS] == 2 + assert user_d[RecordType.ACCIDENT_REPORTS] == 1 + assert list(user_d.keys()) == [RecordType.ARREST_RECORDS, RecordType.DISPATCH_RECORDINGS, RecordType.ACCIDENT_REPORTS] + + + agency_info = annotation_info.agency + auto_agency_suggestions = agency_info.auto + assert auto_agency_suggestions.unknown == False + assert len(auto_agency_suggestions.suggestions) == 3 + + # Check user agency suggestions exist and in descending order of count + user_agency_suggestions = agency_info.users + user_agency_suggestions_as_list = list(user_agency_suggestions.values()) + assert len(user_agency_suggestions_as_list) == 3 + for i in range(3): + assert user_agency_suggestions_as_list[i].count == 3 - i + diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 9444bc79..20b5c194 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -12,6 +12,7 @@ from collector_manager.enums import URLStatus from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator +from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review def test_insert_urls(db_client_test): @@ -194,62 +195,6 @@ async def test_get_urls_with_metadata(db_data_creator: DBDataCreator): ) assert len(results) == 1 -async def setup_for_get_next_url_for_final_review( - db_data_creator: DBDataCreator, - 
annotation_count: int,
-    include_user_annotations: bool = True
-):
-    batch_id = db_data_creator.batch()
-    url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0]
-    await db_data_creator.html_data([url_mapping.url_id])
-
-    async def add_metadata_annotation(count: int, value: str, metadata_id: int):
-        for i in range(count):
-            await db_data_creator.user_annotation(
-                metadata_id=metadata_id,
-                annotation=value
-            )
-
-    async def add_user_suggestion(count: int):
-        agency_id = await db_data_creator.agency()
-        for i in range(count):
-            await db_data_creator.agency_user_suggestions(
-                url_id=url_mapping.url_id,
-                agency_id=agency_id
-            )
-
-    relevant_metadata_ids = await db_data_creator.metadata(
-        url_ids=[url_mapping.url_id],
-        attribute=URLMetadataAttributeType.RELEVANT,
-        value="True",
-        validation_source=ValidationSource.MACHINE_LEARNING,
-        validation_status=ValidationStatus.PENDING_VALIDATION
-    )
-    relevant_metadata_id = relevant_metadata_ids[0]
-    record_type_metadata_ids = await db_data_creator.metadata(
-        url_ids=[url_mapping.url_id],
-        attribute=URLMetadataAttributeType.RECORD_TYPE,
-        value=RecordType.ARREST_RECORDS.value,
-        validation_source=ValidationSource.MACHINE_LEARNING,
-        validation_status=ValidationStatus.PENDING_VALIDATION
-    )
-    record_type_metadata_id = record_type_metadata_ids[0]
-
-    if include_user_annotations:
-        await add_metadata_annotation(annotation_count, "True", relevant_metadata_id)
-        await add_metadata_annotation(1, "False", relevant_metadata_id)
-        await add_metadata_annotation(3, RecordType.ARREST_RECORDS.value, record_type_metadata_id)
-        await add_metadata_annotation(2, RecordType.DISPATCH_RECORDINGS.value, record_type_metadata_id)
-        await add_metadata_annotation(1, RecordType.ACCIDENT_REPORTS.value, record_type_metadata_id)
-
-    if include_user_annotations:
-        # Add user suggestions for agencies, one suggested by 3 users, another by 2, another by 1
-        for i in range(annotation_count):
-            await add_user_suggestion(i + 1)
-
-
-    return url_mapping
-
 
 @pytest.mark.asyncio
 async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreator):
@@ -367,7 +312,6 @@ async def test_get_next_url_for_final_review_favor_more_annotations(db_data_crea
 
     assert result.annotations.agency.confirmed is not None
 
-
-    # TODO: Check that the confirmed agency is shown for the result
 
 
 
@@ -400,10 +344,20 @@ async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBD
     assert relevant.users.relevant == 0
     assert relevant.users.not_relevant == 0
 
-
+@pytest.mark.asyncio
 async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator: DBDataCreator):
     """
-    Test in the case of one URL that is confirmed
+    Test in the case of one URL that is submitted
     Should not be returned.
""" - batch_id = db_data_creator.batch() \ No newline at end of file + batch_id = db_data_creator.batch() + url_mapping = db_data_creator.urls( + batch_id=batch_id, + url_count=1, + outcome=URLStatus.SUBMITTED + ).url_mappings[0] + + result = await db_data_creator.adb_client.get_next_url_for_final_review() + + assert result is None + From c833c2d093a6747e552547e63f61cf2c784666b4 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 21 Feb 2025 13:58:23 -0500 Subject: [PATCH 054/182] DRAFT --- ...cd_add_approved_enum_value_to_urlstatus.py | 70 +++++++++++ collector_db/AsyncDatabaseClient.py | 112 +++++++++++++++++- collector_manager/enums.py | 1 + core/AsyncCore.py | 3 + 4 files changed, 182 insertions(+), 4 deletions(-) create mode 100644 alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py diff --git a/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py b/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py new file mode 100644 index 00000000..a0ff537f --- /dev/null +++ b/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py @@ -0,0 +1,70 @@ +"""Add approved enum value to URLStatus + +Revision ID: 76f902fe18cd +Revises: d7eb670edaf0 +Create Date: 2025-02-21 13:46:00.621485 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '76f902fe18cd' +down_revision: Union[str, None] = 'd7eb670edaf0' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +old_enum_values = ('pending', 'submitted', 'human_labeling', 'rejected', 'duplicate', 'error') +new_enum_values = old_enum_values + ('approved',) + +old_outcome_enum = postgresql.ENUM( + *old_enum_values, name='url_status') + +tmp_new_outcome_enum = postgresql.ENUM( + *new_enum_values, + name='tmp_url_status' +) +new_outcome_enum = postgresql.ENUM( + *new_enum_values, + name='url_status' +) + +common_args = { + "table_name": "urls", + "column_name": "outcome", +} + +def upgrade() -> None: + op.alter_column( + **common_args, + existing_type=old_outcome_enum, + type_=tmp_new_outcome_enum + ) + old_outcome_enum.drop(op.get_bind(), checkfirst=True) + new_outcome_enum.create(op.get_bind()) + + op.alter_column( + **common_args, + existing_type=tmp_new_outcome_enum, + type_=new_outcome_enum + ) + tmp_new_outcome_enum.drop(op.get_bind(), checkfirst=True) + +def downgrade() -> None: + op.alter_column( + **common_args, + existing_type=new_outcome_enum, + type_=old_outcome_enum + ) + + new_outcome_enum.drop(op.get_bind(), checkfirst=True) + old_outcome_enum.create(op.get_bind()) + + op.alter_column( + **common_args, + existing_type=old_outcome_enum, + type_=new_outcome_enum + ) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index f6548dfa..896d18cf 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,7 +1,8 @@ from functools import wraps -from typing import Optional +from typing import Optional, List -from sqlalchemy import select, exists, func, distinct, case, desc, asc +from fastapi import HTTPException +from sqlalchemy import select, exists, func, distinct, case, desc, asc, Select from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, aliased, joinedload @@ -20,7 +21,7 @@ from collector_db.helper_functions import 
get_postgres_connection_string from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, \ - UserUrlAgencySuggestion + UserUrlAgencySuggestion, status from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo @@ -32,7 +33,7 @@ from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO -from core.enums import BatchStatus, SuggestionType +from core.enums import BatchStatus, SuggestionType, RecordType from html_tag_collector.DataClassTags import convert_to_response_html_info @@ -963,3 +964,106 @@ async def get_next_url_for_final_review( ) ) + @session_manager + async def approve_url( + self, + session: AsyncSession, + url_id: int, + record_type: Optional[RecordType], + relevant: Optional[bool], + agency_id: Optional[int] + ) -> None: + + async def set_approved_metadata( + attribute: URLMetadataAttributeType, + value: Optional[str] + ): + selected_metadata = None + for metadata in metadatas: + if metadata.attribute == attribute.value: + selected_metadata = metadata + break + + # If metadata doesn't exist, create it + if selected_metadata is None: + # If a value was not provided for this metadata, raise an error. + if value is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Must specify {attribute.value} value if URL does not already have a {attribute.value} metadata entry" + ) + + metadata_obj = URLMetadata( + attribute=attribute.value, + value=value, + validation_status=ValidationStatus.VALIDATED.value, + validation_source=ValidationSource.MANUAL.value, + url_id=url_id + ) + url.url_metadata.append(metadata_obj) + + else: + + # If value was provided, overwrite existing value. 
Otherwise, ignore + if value is not None: + selected_metadata.value = value + + # Mark metadata as validated + selected_metadata.validation_status = ValidationStatus.VALIDATED.value + selected_metadata.validation_source = ValidationSource.MANUAL.value + + + + # Get URL + + query = ( + Select(URL) + .where(URL.id == url_id) + .options( + selectinload(URL.url_metadata), + selectinload(URL.confirmed_agencies) + ) + ) + + url = await session.execute(query) + url = url.scalars().first() + + metadatas = url.url_metadata + + await set_approved_metadata( + attribute=URLMetadataAttributeType.RECORD_TYPE, + value=record_type + ) + + await set_approved_metadata( + attribute=URLMetadataAttributeType.RELEVANT, + value=relevant + ) + + # Check if agency_id exists as confirmed agency + confirmed_agency = url.confirmed_agencies + # If it doesn't, create it + if confirmed_agency is None: + confirmed_agency = ConfirmedUrlAgency( + agency_id=agency_id, + url_id=url_id + ) + url.confirmed_agencies.append(confirmed_agency) + + # If a different agency exists as confirmed, overwrite it + elif confirmed_agency.agency_id != agency_id: + confirmed_agency.agency_id = agency_id + + # If it does, do nothing + + url.outcome = URLStatus.APPROVED + + + + + # Confirm that URL has + # - a confirmed agency + # - a validated record_type + # - a validated relevant + + diff --git a/collector_manager/enums.py b/collector_manager/enums.py index 3820f274..b4289488 100644 --- a/collector_manager/enums.py +++ b/collector_manager/enums.py @@ -16,3 +16,4 @@ class URLStatus(Enum): REJECTED = "rejected" DUPLICATE = "duplicate" ERROR = "error" + APPROVED = "approved" diff --git a/core/AsyncCore.py b/core/AsyncCore.py index d2085540..4ca4129c 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -223,3 +223,6 @@ async def submit_url_agency_annotation( async def get_next_source_for_review(self): return await self.adb_client.get_next_url_for_final_review() + + async def approve_and_get_next_source_for_review( + self, url_id: int): From 953206fcd3d9bed2250b112d9938738aa1fb275d Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 22 Feb 2025 15:11:09 -0500 Subject: [PATCH 055/182] DRAFT --- ...cd_add_approved_enum_value_to_urlstatus.py | 26 +++++++++++++------ collector_db/AsyncDatabaseClient.py | 14 ++-------- collector_db/models.py | 10 ++++++- core/AsyncCore.py | 12 ++++++++- core/DTOs/FinalReviewApprovalInfo.py | 26 +++++++++++++++++++ .../collector_db/test_db_client.py | 18 +++++++++++++ 6 files changed, 84 insertions(+), 22 deletions(-) create mode 100644 core/DTOs/FinalReviewApprovalInfo.py diff --git a/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py b/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py index a0ff537f..b548cc54 100644 --- a/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py +++ b/alembic/versions/76f902fe18cd_add_approved_enum_value_to_urlstatus.py @@ -21,7 +21,9 @@ new_enum_values = old_enum_values + ('approved',) old_outcome_enum = postgresql.ENUM( - *old_enum_values, name='url_status') + *old_enum_values, + name='url_status' +) tmp_new_outcome_enum = postgresql.ENUM( *new_enum_values, @@ -38,33 +40,41 @@ } def upgrade() -> None: + tmp_new_outcome_enum.create(op.get_bind(), checkfirst=True) op.alter_column( **common_args, existing_type=old_outcome_enum, - type_=tmp_new_outcome_enum + type_=tmp_new_outcome_enum, + postgresql_using='outcome::text::tmp_url_status' ) old_outcome_enum.drop(op.get_bind(), checkfirst=True) - new_outcome_enum.create(op.get_bind()) 
+ new_outcome_enum.create(op.get_bind(), checkfirst=True) op.alter_column( **common_args, existing_type=tmp_new_outcome_enum, - type_=new_outcome_enum + type_=new_outcome_enum, + postgresql_using='outcome::text::url_status' ) tmp_new_outcome_enum.drop(op.get_bind(), checkfirst=True) def downgrade() -> None: + tmp_new_outcome_enum.create(op.get_bind()) op.alter_column( **common_args, existing_type=new_outcome_enum, - type_=old_outcome_enum + type_=tmp_new_outcome_enum, + postgresql_using='outcome::text::tmp_url_status' ) new_outcome_enum.drop(op.get_bind(), checkfirst=True) - old_outcome_enum.create(op.get_bind()) + old_outcome_enum.create(op.get_bind(), checkfirst=True) op.alter_column( **common_args, - existing_type=old_outcome_enum, - type_=new_outcome_enum + existing_type=tmp_new_outcome_enum, + type_=old_outcome_enum, + postgresql_using='outcome::text::url_status' ) + + tmp_new_outcome_enum.drop(op.get_bind(), checkfirst=True) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 896d18cf..fbf22f7c 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -5,6 +5,7 @@ from sqlalchemy import select, exists, func, distinct, case, desc, asc, Select from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, aliased, joinedload +from starlette import status from collector_db.ConfigManager import ConfigManager from collector_db.DTOConverter import DTOConverter @@ -21,7 +22,7 @@ from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, \ - UserUrlAgencySuggestion, status + UserUrlAgencySuggestion from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo @@ -1012,8 +1013,6 @@ async def set_approved_metadata( selected_metadata.validation_status = ValidationStatus.VALIDATED.value selected_metadata.validation_source = ValidationSource.MANUAL.value - - # Get URL query = ( @@ -1058,12 +1057,3 @@ async def set_approved_metadata( url.outcome = URLStatus.APPROVED - - - - # Confirm that URL has - # - a confirmed agency - # - a validated record_type - # - a validated relevant - - diff --git a/collector_db/models.py b/collector_db/models.py index f462198c..338d83ce 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -80,7 +80,15 @@ class URL(Base): collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. 
outcome = Column(
-        postgresql.ENUM('pending', 'submitted', 'human_labeling', 'rejected', 'duplicate', 'error', name='url_status'),
+        postgresql.ENUM(
+            'pending',
+            'submitted',
+            'human_labeling',
+            'rejected',
+            'duplicate',
+            'error',
+            'approved',
+            name='url_status'
+        ),
         nullable=False
     )
     created_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT)
diff --git a/core/AsyncCore.py b/core/AsyncCore.py
index 4ca4129c..4e6794d8 100644
--- a/core/AsyncCore.py
+++ b/core/AsyncCore.py
@@ -7,6 +7,7 @@
 from collector_db.DTOs.TaskInfo import TaskInfo
 from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo
 from collector_db.enums import TaskType, URLMetadataAttributeType
+from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
 from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \
     URLAgencyAnnotationPostInfo
 from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse
@@ -225,4 +226,13 @@
         return await self.adb_client.get_next_url_for_final_review()
 
     async def approve_and_get_next_source_for_review(
-        self, url_id: int):
+        self,
+        approval_info: FinalReviewApprovalInfo
+    ):
+        await self.adb_client.approve_url(
+            url_id=approval_info.url_id,
+            record_type=approval_info.record_type,
+            relevant=approval_info.relevant,
+            agency_id=approval_info.agency_id
+        )
+        return await self.get_next_source_for_review()
diff --git a/core/DTOs/FinalReviewApprovalInfo.py b/core/DTOs/FinalReviewApprovalInfo.py
new file mode 100644
index 00000000..96af2f87
--- /dev/null
+++ b/core/DTOs/FinalReviewApprovalInfo.py
@@ -0,0 +1,26 @@
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from core.enums import RecordType
+
+
+class FinalReviewApprovalInfo(BaseModel):
+    url_id: int = Field(
+        title="The id of the URL."
+    )
+    record_type: Optional[RecordType] = Field(
+        title="The final record type of the URL. "
+              "If none, defers to the existing value from the auto-labeler only if it exists.",
+        default=None
+    )
+    relevant: Optional[bool] = Field(
+        title="Final determination on whether the URL is relevant. "
+              "If none, defers to the existing value from the auto-labeler only if it exists.",
+        default=None
+    )
+    agency_id: Optional[int] = Field(
+        title="The final confirmed agency for the URL. "
+              "If none, defers to an existing confirmed agency only if that exists.",
+        default=None
+    )
diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py
index 20b5c194..833e157a 100644
--- a/tests/test_automated/integration/collector_db/test_db_client.py
+++ b/tests/test_automated/integration/collector_db/test_db_client.py
@@ -361,3 +361,21 @@
     assert result is None
 
+
+@pytest.mark.asyncio
+async def test_approve_url_basic(db_data_creator: DBDataCreator):
+    url_mapping = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=3,
+        include_user_annotations=True
+    )
+
+    # Add confirmed agency
+    await db_data_creator.agency_confirmed_suggestion(
+        url_id=url_mapping.url_id
+    )
+
+    # Approve URL. Only the URL should be affected. No other properties should be changed.
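+    # TODO: After the approve_url call below, assert the outcome flip.
+    # A minimal sketch, assuming a fetch helper such as `get_url_by_id`
+    # exists on the async client (that helper name is an assumption, not
+    # something this draft defines):
+    #
+    #     url_info = await db_data_creator.adb_client.get_url_by_id(url_mapping.url_id)
+    #     assert url_info.outcome == URLStatus.APPROVED.value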
+ await db_data_creator.adb_client.approve_url(url_mapping.url_id) + + + From b550fe5650a5466cb193d6e012ed0f8cedfc0308 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 23 Feb 2025 21:40:08 -0500 Subject: [PATCH 056/182] DRAFT - Begin Overhaul --- alembic.ini | 2 +- ...6ce_update_confirmed_url_agency_unique_.py | 44 ++++ ...0590bb_overhaul_annotation_organization.py | 225 ++++++++++++++++ api/routes/annotate.py | 19 +- collector_db/AsyncDatabaseClient.py | 171 ++++++++++-- collector_db/DTOConverter.py | 243 +++++++++-------- collector_db/StatementComposer.py | 22 +- collector_db/models.py | 244 ++++++++++++------ core/AsyncCore.py | 19 +- tests/helpers/DBDataCreator.py | 16 +- tests/helpers/complex_test_data_functions.py | 6 + .../integration/api/test_annotate.py | 36 +++ .../collector_db/test_db_client.py | 44 +++- 13 files changed, 858 insertions(+), 233 deletions(-) create mode 100644 alembic/versions/2025_02_23_0855-0c6dc00806ce_update_confirmed_url_agency_unique_.py create mode 100644 alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py diff --git a/alembic.ini b/alembic.ini index 9daecaa2..cfa2db9a 100644 --- a/alembic.ini +++ b/alembic.ini @@ -9,7 +9,7 @@ script_location = alembic # Uncomment the line below if you want the files to be prepended with date and time # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file # for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s +file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s # sys.path path, will be prepended to sys.path if present. # defaults to the current working directory. diff --git a/alembic/versions/2025_02_23_0855-0c6dc00806ce_update_confirmed_url_agency_unique_.py b/alembic/versions/2025_02_23_0855-0c6dc00806ce_update_confirmed_url_agency_unique_.py new file mode 100644 index 00000000..b081ec9d --- /dev/null +++ b/alembic/versions/2025_02_23_0855-0c6dc00806ce_update_confirmed_url_agency_unique_.py @@ -0,0 +1,44 @@ +"""Update confirmed_url_agency unique constraint to be only url_id + +Revision ID: 0c6dc00806ce +Revises: 76f902fe18cd +Create Date: 2025-02-23 08:55:07.046607 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '0c6dc00806ce' +down_revision: Union[str, None] = '76f902fe18cd' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.drop_constraint( + constraint_name="uq_confirmed_url_agency", + table_name="confirmed_url_agency", + ) + + op.create_unique_constraint( + constraint_name="uq_confirmed_url_agency", + table_name="confirmed_url_agency", + columns=["url_id"], + ) + + +def downgrade() -> None: + op.drop_constraint( + constraint_name="uq_confirmed_url_agency", + table_name="confirmed_url_agency", + ) + + op.create_unique_constraint( + constraint_name="uq_confirmed_url_agency", + table_name="confirmed_url_agency", + columns=["url_id", "agency_id"], + ) \ No newline at end of file diff --git a/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py b/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py new file mode 100644 index 00000000..4b453174 --- /dev/null +++ b/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py @@ -0,0 +1,225 @@ +"""Overhaul annotation organization + +New Tables +- AutoRelevantSuggestions +- AutoRecordTypeSuggestions +- UserRelevantSuggestions +- UserRecordTypeSuggestions + +New Columns for `URL` +- `agency_id` +- `record_type` +- `relevant` + +Removed Tables +- `URLMetadata` +- `ConfirmedURLAgency` +- `MetadataAnnotation` + +Revision ID: 33421c0590bb +Revises: 0c6dc00806ce +Create Date: 2025-02-23 10:23:19.696248 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import UniqueConstraint + +from core.enums import RecordType + +# revision identifiers, used by Alembic. +revision: str = '33421c0590bb' +down_revision: Union[str, None] = '0c6dc00806ce' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +record_type_values = [ + "Accident Reports", + "Arrest Records", + "Calls for Service", + "Car GPS", + "Citations", + "Dispatch Logs", + "Dispatch Recordings", + "Field Contacts", + "Incident Reports", + "Misc Police Activity", + "Officer Involved Shootings", + "Stops", + "Surveys", + "Use of Force Reports", + "Vehicle Pursuits", + "Complaints & Misconduct", + "Daily Activity Logs", + "Training & Hiring Info", + "Personnel Records", + "Annual & Monthly Reports", + "Budgets & Finances", + "Contact Info & Agency Meta", + "Geographic", + "List of Data Sources", + "Policies & Contracts", + "Crime Maps & Reports", + "Crime Statistics", + "Media Bulletins", + "Records Request Info", + "Resources", + "Sex Offender Registry", + "Wanted Persons", + "Booking Reports", + "Court Cases", + "Incarceration Records", + "Other" +] + + +record_type_enum = sa.Enum(*record_type_values, name='record_type') + +def upgrade() -> None: + # Delete the old tables + op.drop_table('metadata_annotations') + op.drop_table('url_metadata') + op.drop_table('confirmed_url_agency') + + # Create the new tables + op.create_table( + 'auto_relevant_suggestions', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id', ondelete='CASCADE'), nullable=False), + sa.Column('relevant', sa.Boolean(), nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), + UniqueConstraint( + 'url_id', + 
name='auto_relevant_suggestions_uq_url_id' + ) + ) + + op.create_table( + 'auto_record_type_suggestions', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column( + 'url_id', + sa.Integer(), + sa.ForeignKey('urls.id', ondelete='CASCADE'), + nullable=False + ), + sa.Column('record_type', record_type_enum, nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), + UniqueConstraint( + 'url_id', + name='auto_record_type_suggestions_uq_url_id' + ) + ) + + op.create_table( + 'user_relevant_suggestions', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column( + 'url_id', + sa.Integer(), + sa.ForeignKey('urls.id', ondelete='CASCADE'), + nullable=False + ), + sa.Column('user_id', sa.Integer(), nullable=False), + sa.Column('relevant', sa.Boolean(), nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), + sa.UniqueConstraint("url_id", "user_id", name="uq_user_relevant_suggestions") + ) + + op.create_table( + 'user_record_type_suggestions', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column( + 'url_id', + sa.Integer(), + sa.ForeignKey('urls.id', ondelete='CASCADE'), + nullable=False + ), + sa.Column('user_id', sa.Integer(), nullable=False), + sa.Column('record_type', record_type_enum, nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), + sa.UniqueConstraint("url_id", "user_id", name="uq_user_record_type_suggestions") + ) + + # Add the new columns + op.add_column( + 'urls', + sa.Column('record_type', record_type_enum, nullable=True) + ) + + op.add_column( + 'urls', + sa.Column('relevant', sa.Boolean(), nullable=True) + ) + + op.add_column( + 'urls', + sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey('agencies.agency_id', ondelete='NO ACTION'), + nullable=True + ) + ) + + + + +def downgrade() -> None: + # Drop the new tables + op.drop_table('auto_relevant_suggestions') + op.drop_table('auto_record_type_suggestions') + op.drop_table('user_relevant_suggestions') + op.drop_table('user_record_type_suggestions') + + # Drop the new columns + op.drop_column('urls', 'record_type') + op.drop_column('urls', 'relevant') + op.drop_column('urls', 'agency_id') + + # Create the old tables + op.create_table( + 'url_metadata', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id', ondelete='CASCADE'), nullable=False), + sa.Column('attribute', sa.String(), nullable=False), + sa.Column('value', sa.String(), nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), + sa.UniqueConstraint( + "url_id", + "attribute", + name="uq_url_id_attribute"), + ) + + op.create_table( + 'confirmed_url_agency', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id', ondelete='CASCADE'), nullable=False), + sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey('agencies.agency_id', ondelete='CASCADE'), 
nullable=False),
+        sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')),
+        sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')),
+        sa.UniqueConstraint("url_id", name="uq_confirmed_url_agency")
+    )
+
+    op.create_table(
+        'metadata_annotations',
+        sa.Column('id', sa.Integer(), primary_key=True),
+        sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id', ondelete='CASCADE'), nullable=False),
+        sa.Column('user_id', sa.Integer(), nullable=False),
+        sa.Column('metadata_id', sa.Integer(), sa.ForeignKey('url_metadata.id'), nullable=False),
+        sa.Column('value', sa.Text(), nullable=False),
+        sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')),
+        sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')),
+        sa.UniqueConstraint(
+            "user_id",
+            "metadata_id",
+            name="metadata_annotations_uq_user_id_metadata_id"),
+    )
diff --git a/api/routes/annotate.py b/api/routes/annotate.py
index 591920ff..1c33a978 100644
--- a/api/routes/annotate.py
+++ b/api/routes/annotate.py
@@ -29,23 +29,24 @@ async def get_next_url_for_relevance_annotation(
     return result
 
 
-@annotate_router.post("/relevance/{metadata_id}")
+@annotate_router.post("/relevance/{url_id}")
 async def annotate_url_for_relevance_and_get_next_url(
     relevance_annotation_post_info: RelevanceAnnotationPostInfo,
-    metadata_id: int = Path(description="The metadata id for the associated URL metadata"),
+    url_id: int = Path(description="The URL id to annotate"),
     async_core: AsyncCore = Depends(get_async_core),
     access_info: AccessInfo = Depends(get_access_info)
 ) -> GetNextURLForAnnotationResponse:
     """
     Post URL annotation and get next URL to annotate
     """
-    result = await async_core.submit_and_get_next_url_for_annotation(
+    await async_core.submit_url_relevance_annotation(
+        user_id=access_info.user_id,
+        url_id=url_id,
+        relevant=relevance_annotation_post_info.is_relevant
+    )
+    return await async_core.get_next_url_for_relevance_annotation(
         user_id=access_info.user_id,
-        metadata_id=metadata_id,
-        annotation=str(relevance_annotation_post_info.is_relevant),
-        metadata_type = URLMetadataAttributeType.RELEVANT
     )
-    return result
 
 @annotate_router.get("/record-type")
 async def get_next_url_for_record_type_annotation(
@@ -58,10 +59,10 @@ async def get_next_url_for_record_type_annotation(
     )
     return result
 
-@annotate_router.post("/record-type/{metadata_id}")
+@annotate_router.post("/record-type/{url_id}")
 async def annotate_url_for_record_type_and_get_next_url(
     record_type_annotation_post_info: RecordTypeAnnotationPostInfo,
-    metadata_id: int = Path(description="The metadata id for the associated URL metadata"),
+    url_id: int = Path(description="The URL id to annotate"),
     async_core: AsyncCore = Depends(get_async_core),
     access_info: AccessInfo = Depends(get_access_info)
 ) -> GetNextURLForAnnotationResponse:
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index fbf22f7c..3e7eee14 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -2,7 +2,7 @@
 from typing import Optional, List
 
 from fastapi import HTTPException
-from sqlalchemy import select, exists, func, distinct, case, desc, asc, Select
+from sqlalchemy import select, exists, func, distinct, case, desc, asc, Select, not_
 from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
 from sqlalchemy.orm import selectinload, aliased, joinedload
 from starlette import status
@@ -20,9 +20,10 @@
 from collector_db.StatementComposer import StatementComposer
 from
collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_db.helper_functions import get_postgres_connection_string -from collector_db.models import URLMetadata, URL, URLErrorInfo, URLHTMLContent, Base, MetadataAnnotation, \ - RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, \ - UserUrlAgencySuggestion +from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ + RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ + UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ + UserRecordTypeSuggestion from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo @@ -92,6 +93,124 @@ async def get_url_metadata_by_status( model_result = scalar_result.all() return [URLMetadataInfo(**url_metadata.__dict__) for url_metadata in model_result] + # region relevant + @session_manager + async def add_auto_relevant_suggestion( + self, + session: AsyncSession, + url_id: int, + relevant: bool + ): + suggestion = AutoRelevantSuggestion( + url_id=url_id, + relevant=relevant + ) + session.add(suggestion) + + @session_manager + async def add_user_relevant_suggestion( + self, + session: AsyncSession, + url_id: int, + user_id: int, + relevant: bool + ): + suggestion = UserRelevantSuggestion( + url_id=url_id, + user_id=user_id, + relevant=relevant + ) + session.add(suggestion) + + @session_manager + async def get_next_url_for_relevance_annotation( + self, + session: AsyncSession, + user_id: int + ): + url_query = ( + select( + URL, + ) + # TODO: Generalize this whole section + .where(exists(select(URLHTMLContent).where(URLHTMLContent.url_id == URL.id))) + # URL must not have metadata annotation by this user + # TODO: Have this as a parameter for the user model + .where( + not_( + exists( + select(UserRelevantSuggestion) + .where( + UserRelevantSuggestion.url_id == URL.id, + UserRelevantSuggestion.user_id == user_id + ) + ) + ) + # TODO: Parameterize relationship attribute to joinedload + ).options( + joinedload(URL.auto_relevant_suggestions), + joinedload(URL.html_content) + ). 
+ limit(1) + ) + + raw_result = await session.execute(url_query) + + url: URL = raw_result.scalars().one_or_none() + if url is None: + return None + + # Next, get all HTML content for the URL + html_response_info = DTOConverter.html_content_list_to_html_response_info( + url.html_content + ) + + # Get auto-suggestion if exists + if len(url.auto_relevant_suggestions) > 0: + auto_suggestion = url.auto_relevant_suggestions[0].relevant + else: + auto_suggestion = None + + return RelevanceAnnotationResponseInfo( + url_id=url.id, + suggested_relevant=auto_suggestion, + html_response_info=html_response_info + ) + + + + #endregion relevant + + @session_manager + async def add_auto_record_type_suggestion( + self, + session: AsyncSession, + url_id: int, + record_type: RecordType + ): + suggestion = AutoRecordTypeSuggestion( + url_id=url_id, + record_type=record_type.value + ) + session.add(suggestion) + + + @session_manager + async def add_user_record_type_suggestion( + self, + session: AsyncSession, + url_id: int, + user_id: int, + record_type: RecordType + ): + suggestion = UserRecordTypeSuggestion( + url_id=url_id, + user_id=user_id, + record_type=record_type.value + ) + session.add(suggestion) + + @session_manager async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo) -> int: result = await self._add_models(session, URLMetadata, [url_metadata_info]) @@ -281,9 +400,6 @@ async def get_next_url_for_annotation( .limit(1) ) - raw_result = await session.execute(subquery) - result = raw_result.all() - # Next, get all HTML content for the URL statement = ( @@ -334,17 +450,17 @@ async def add_metadata_annotation( ) session.add(annotation) - @session_manager - async def get_annotations_for_metadata_id( - self, - session: AsyncSession, - metadata_id: int - ) -> list[MetadataAnnotation]: - statement = (select(MetadataAnnotation). - where(MetadataAnnotation.metadata_id == metadata_id)) - scalar_result = await session.scalars(statement) - all_results = scalar_result.all() - return [MetadataAnnotationInfo(**result.__dict__) for result in all_results] + # @session_manager + # async def get_annotations_for_metadata_id( + # self, + # session: AsyncSession, + # metadata_id: int + # ) -> list[MetadataAnnotation]: + # statement = (select(MetadataAnnotation). 
+ # where(MetadataAnnotation.metadata_id == metadata_id)) + # scalar_result = await session.scalars(statement) + # all_results = scalar_result.all() + # return [MetadataAnnotationInfo(**result.__dict__) for result in all_results] @session_manager async def get_all(self, session, model: Base): @@ -970,9 +1086,9 @@ async def approve_url( self, session: AsyncSession, url_id: int, - record_type: Optional[RecordType], - relevant: Optional[bool], - agency_id: Optional[int] + record_type: Optional[RecordType] = None, + relevant: Optional[bool] = None, + agency_id: Optional[int] = None ) -> None: async def set_approved_metadata( @@ -1040,9 +1156,16 @@ async def set_approved_metadata( ) # Check if agency_id exists as confirmed agency - confirmed_agency = url.confirmed_agencies + confirmed_agency = url.confirmed_agencies[0] if len(url.confirmed_agencies) > 0 else None + # If it doesn't, create it if confirmed_agency is None: + if agency_id is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Must specify agency_id if URL does not already have a confirmed agency" + ) + confirmed_agency = ConfirmedUrlAgency( agency_id=agency_id, url_id=url_id @@ -1050,10 +1173,10 @@ async def set_approved_metadata( url.confirmed_agencies.append(confirmed_agency) # If a different agency exists as confirmed, overwrite it - elif confirmed_agency.agency_id != agency_id: + elif confirmed_agency.agency_id != agency_id and agency_id is not None: confirmed_agency.agency_id = agency_id # If it does, do nothing - url.outcome = URLStatus.APPROVED + url.outcome = URLStatus.APPROVED.value diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py index 63c88d92..9b2b0d24 100644 --- a/collector_db/DTOConverter.py +++ b/collector_db/DTOConverter.py @@ -1,28 +1,30 @@ +from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType from collector_db.enums import ValidationStatus, ValidationSource, URLMetadataAttributeType -from collector_db.models import URLMetadata, ConfirmedUrlAgency, AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, \ - MetadataAnnotation +from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo from core.DTOs.GetNextURLForFinalReviewResponse import FinalReviewAnnotationRelevantInfo, \ FinalReviewAnnotationRelevantUsersInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyUserInfo from core.enums import RecordType, SuggestionType - - -def get_url_metadata( - url_metadatas: list[URLMetadata], - validation_status: ValidationStatus, - validation_source: ValidationSource, - attribute: URLMetadataAttributeType -): - for url_metadata in url_metadatas: - if url_metadata.validation_status != validation_status.value: - continue - if url_metadata.validation_source != validation_source.value: - continue - if url_metadata.attribute != attribute.value: - continue - return url_metadata - +from html_tag_collector.DataClassTags import convert_to_response_html_info, ResponseHTMLInfo, ENUM_TO_ATTRIBUTE_MAPPING + + +# +# def get_url_metadata( +# url_metadatas: list[URLMetadata], +# validation_status: ValidationStatus, +# validation_source: ValidationSource, +# attribute: URLMetadataAttributeType +# ): +# for url_metadata in url_metadatas: +# if url_metadata.validation_status != validation_status.value: +# continue +# if url_metadata.validation_source != 
validation_source.value: +# continue +# if url_metadata.attribute != attribute.value: +# continue +# return url_metadata +# class DTOConverter: @@ -30,67 +32,67 @@ class DTOConverter: """ Converts SQLAlchemy objects to DTOs """ - - @staticmethod - def final_review_annotation_relevant_info( - url_metadatas: list[URLMetadata] - ) -> FinalReviewAnnotationRelevantInfo: - relevant_metadata = get_url_metadata( - url_metadatas=url_metadatas, - validation_status=ValidationStatus.PENDING_VALIDATION, - validation_source=ValidationSource.MACHINE_LEARNING, - attribute=URLMetadataAttributeType.RELEVANT - ) - auto_value = relevant_metadata.value if relevant_metadata else None - if auto_value is not None: - auto_value = (auto_value == "True") - - - annotations: list[MetadataAnnotation] = relevant_metadata.annotations if relevant_metadata else [] - relevant_count = 0 - not_relevant_count = 0 - for annotation in annotations: - if annotation.value == "True": - relevant_count += 1 - else: - not_relevant_count += 1 - return FinalReviewAnnotationRelevantInfo( - auto=auto_value, - users=FinalReviewAnnotationRelevantUsersInfo( - relevant=relevant_count, - not_relevant=not_relevant_count - ) - ) - - @staticmethod - def final_review_annotation_record_type_info( - url_metadata: list[URLMetadata] - ): - record_type_metadata = get_url_metadata( - url_metadatas=url_metadata, - validation_status=ValidationStatus.PENDING_VALIDATION, - validation_source=ValidationSource.MACHINE_LEARNING, - attribute=URLMetadataAttributeType.RECORD_TYPE - ) - user_count = {} - if record_type_metadata is None: - auto_value = None - annotations = [] - else: - auto_value = RecordType(record_type_metadata.value) - annotations = record_type_metadata.annotations - for annotation in annotations: - value = RecordType(annotation.value) - if value not in user_count: - user_count[value] = 0 - user_count[value] += 1 - # Sort users by count, descending - user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) - - return FinalReviewAnnotationRecordTypeInfo( - auto=auto_value, - users=user_count - ) + # + # @staticmethod + # def final_review_annotation_relevant_info( + # url_metadatas: list[URLMetadata] + # ) -> FinalReviewAnnotationRelevantInfo: + # relevant_metadata = get_url_metadata( + # url_metadatas=url_metadatas, + # validation_status=ValidationStatus.PENDING_VALIDATION, + # validation_source=ValidationSource.MACHINE_LEARNING, + # attribute=URLMetadataAttributeType.RELEVANT + # ) + # auto_value = relevant_metadata.value if relevant_metadata else None + # if auto_value is not None: + # auto_value = (auto_value == "True") + # + # + # annotations: list[MetadataAnnotation] = relevant_metadata.annotations if relevant_metadata else [] + # relevant_count = 0 + # not_relevant_count = 0 + # for annotation in annotations: + # if annotation.value == "True": + # relevant_count += 1 + # else: + # not_relevant_count += 1 + # return FinalReviewAnnotationRelevantInfo( + # auto=auto_value, + # users=FinalReviewAnnotationRelevantUsersInfo( + # relevant=relevant_count, + # not_relevant=not_relevant_count + # ) + # ) + # + # @staticmethod + # def final_review_annotation_record_type_info( + # url_metadata: list[URLMetadata] + # ): + # record_type_metadata = get_url_metadata( + # url_metadatas=url_metadata, + # validation_status=ValidationStatus.PENDING_VALIDATION, + # validation_source=ValidationSource.MACHINE_LEARNING, + # attribute=URLMetadataAttributeType.RECORD_TYPE + # ) + # user_count = {} + # if record_type_metadata is None: + # 
auto_value = None + # annotations = [] + # else: + # auto_value = RecordType(record_type_metadata.value) + # annotations = record_type_metadata.annotations + # for annotation in annotations: + # value = RecordType(annotation.value) + # if value not in user_count: + # user_count[value] = 0 + # user_count[value] += 1 + # # Sort users by count, descending + # user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) + # + # return FinalReviewAnnotationRecordTypeInfo( + # auto=auto_value, + # users=user_count + # ) @staticmethod def final_review_annotation_agency_auto_info( @@ -158,41 +160,60 @@ def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( # Return sorted return dict(sorted(d.items(), key=lambda x: x[1].count, reverse=True)) + # + # @staticmethod + # def final_review_annotation_agency_info( + # automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], + # confirmed_agencies: list[ConfirmedUrlAgency], + # user_agency_suggestions: list[UserUrlAgencySuggestion] + # ): + # if len(confirmed_agencies) == 1: + # confirmed_agency = confirmed_agencies[0] + # confirmed_agency_info = GetNextURLForAgencyAgencyInfo( + # suggestion_type=SuggestionType.CONFIRMED, + # pdap_agency_id=confirmed_agency.agency_id, + # agency_name=confirmed_agency.agency.name, + # state=confirmed_agency.agency.state, + # county=confirmed_agency.agency.county, + # locality=confirmed_agency.agency.locality + # ) + # return FinalReviewAnnotationAgencyInfo( + # confirmed=confirmed_agency_info, + # users=None, + # auto=None + # ) + # + # + # agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( + # automated_agency_suggestions + # ) + # + # agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + # user_agency_suggestions + # ) + # + # return FinalReviewAnnotationAgencyInfo( + # confirmed=None, + # users=agency_user_info, + # auto=agency_auto_info + # ) + # @staticmethod - def final_review_annotation_agency_info( - automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], - confirmed_agencies: list[ConfirmedUrlAgency], - user_agency_suggestions: list[UserUrlAgencySuggestion] - ): - if len(confirmed_agencies) == 1: - confirmed_agency = confirmed_agencies[0] - confirmed_agency_info = GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=confirmed_agency.agency_id, - agency_name=confirmed_agency.agency.name, - state=confirmed_agency.agency.state, - county=confirmed_agency.agency.county, - locality=confirmed_agency.agency.locality - ) - return FinalReviewAnnotationAgencyInfo( - confirmed=confirmed_agency_info, - users=None, - auto=None - ) + def html_content_list_to_html_response_info(html_content_list: list[URLHTMLContent]): + response_html_info = ResponseHTMLInfo() + for html_content in html_content_list: + content_type = HTMLContentType(html_content.content_type) + content = html_content.content - agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( - automated_agency_suggestions - ) + setattr( + response_html_info, + ENUM_TO_ATTRIBUTE_MAPPING[content_type], + content + ) - agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_agency_suggestions - ) - return FinalReviewAnnotationAgencyInfo( - confirmed=None, - users=agency_user_info, - auto=agency_auto_info - ) + return response_html_info + diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 
fd1b11a9..6b6ab677 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -3,8 +3,7 @@ from sqlalchemy.orm import aliased from collector_db.enums import URLMetadataAttributeType, ValidationStatus -from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation, AutomatedUrlAgencySuggestion, \ - ConfirmedUrlAgency +from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion from collector_manager.enums import URLStatus @@ -72,4 +71,21 @@ def exclude_urls_with_agency_suggestions( .where(~exists().where(ConfirmedAgency.url_id == URL.id)) ) # Exclude if confirmed agencies exist - return statement \ No newline at end of file + return statement + + @staticmethod + def get_all_html_content_for_url(subquery) -> Select: + statement = ( + select( + subquery.c.url, + subquery.c.metadata_id, + subquery.c.value, + URLHTMLContent.content_type, + URLHTMLContent.content, + ) + .join(URLHTMLContent) + .where(subquery.c.url_id == URLHTMLContent.url_id) + ) + + raw_result = await session.execute(statement) + result = raw_result.all() \ No newline at end of file diff --git a/collector_db/models.py b/collector_db/models.py index 338d83ce..8e3d2a7d 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -7,7 +7,7 @@ from sqlalchemy.orm import declarative_base, relationship from collector_db.enums import PGEnum -from core.enums import BatchStatus +from core.enums import BatchStatus, RecordType from util.helper_functions import get_enum_values # Base class for SQLAlchemy ORM models @@ -19,6 +19,15 @@ batch_status_enum = PGEnum('complete', 'error', 'in-process', 'aborted', name='batch_status') +record_type_values = get_enum_values(RecordType) + +def get_created_at_column(): + return Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + +def get_updated_at_column(): + return Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT, onupdate=CURRENT_TIME_SERVER_DEFAULT) + + class Batch(Base): __tablename__ = 'batches' @@ -91,13 +100,15 @@ class URL(Base): ), nullable=False ) - created_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) - updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + agency_id = Column(Integer, ForeignKey('agencies.agency_id', name='fk_url_agency_id')) + record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=True) + relevant = Column(Boolean, nullable=True) + created_at = get_created_at_column() + updated_at = get_updated_at_column() # Relationships batch = relationship("Batch", back_populates="urls") duplicates = relationship("Duplicate", back_populates="original_url") - url_metadata = relationship("URLMetadata", back_populates="url", cascade="all, delete-orphan") html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") error_info = relationship("URLErrorInfo", back_populates="url", cascade="all, delete-orphan") tasks = relationship( @@ -105,60 +116,63 @@ class URL(Base): secondary="link_task_urls", back_populates="urls", ) + agency = relationship("Agency", back_populates="urls") automated_agency_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="url") user_agency_suggestions = relationship("UserUrlAgencySuggestion", back_populates="url") - confirmed_agencies = relationship("ConfirmedUrlAgency", back_populates="url") - - -# URL Metadata table definition -class URLMetadata(Base): - 
__tablename__ = 'url_metadata' - __table_args__ = (UniqueConstraint( - "url_id", - "attribute", - name="uq_url_id_attribute"), - ) - - id = Column(Integer, primary_key=True, autoincrement=True) - url_id = Column(Integer, ForeignKey('urls.id', name='url_metadata_url_id_fkey'), nullable=False) - attribute = Column( - PGEnum('Record Type', 'Agency', 'Relevant', name='url_attribute'), - nullable=False) - value = Column(Text, nullable=False) - validation_status = Column( - PGEnum('Pending Validation', 'Validated', name='metadata_validation_status'), - nullable=False) - validation_source = Column( - PGEnum('Machine Learning', 'Label Studio', 'Manual', name='validation_source'), - nullable=False - ) - notes = Column(Text, nullable=True) - - - # Timestamps - created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) - updated_at = Column(TIMESTAMP, nullable=False, server_default=func.now(), onupdate=func.now()) - - # Relationships - url = relationship("URL", back_populates="url_metadata") - annotations = relationship("MetadataAnnotation", back_populates="url_metadata") - -class MetadataAnnotation(Base): - __tablename__ = 'metadata_annotations' - __table_args__ = (UniqueConstraint( - "user_id", - "metadata_id", - name="metadata_annotations_uq_user_id_metadata_id"), - ) - - id = Column(Integer, primary_key=True, autoincrement=True) - user_id = Column(Integer, nullable=False) - metadata_id = Column(Integer, ForeignKey('url_metadata.id'), nullable=False) - value = Column(Text, nullable=False) - created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) - - # Relationships - url_metadata = relationship("URLMetadata", back_populates="annotations") + auto_record_type_suggestions = relationship("AutoRecordTypeSuggestion", back_populates="url") + user_record_type_suggestions = relationship("UserRecordTypeSuggestion", back_populates="url") + auto_relevant_suggestions = relationship("AutoRelevantSuggestion", back_populates="url") + user_relevant_suggestions = relationship("UserRelevantSuggestion", back_populates="url") + +# # URL Metadata table definition +# class URLMetadata(Base): +# __tablename__ = 'url_metadata' +# __table_args__ = (UniqueConstraint( +# "url_id", +# "attribute", +# name="uq_url_id_attribute"), +# ) +# +# id = Column(Integer, primary_key=True, autoincrement=True) +# url_id = Column(Integer, ForeignKey('urls.id', name='url_metadata_url_id_fkey'), nullable=False) +# attribute = Column( +# PGEnum('Record Type', 'Agency', 'Relevant', name='url_attribute'), +# nullable=False) +# value = Column(Text, nullable=False) +# validation_status = Column( +# PGEnum('Pending Validation', 'Validated', name='metadata_validation_status'), +# nullable=False) +# validation_source = Column( +# PGEnum('Machine Learning', 'Label Studio', 'Manual', name='validation_source'), +# nullable=False +# ) +# notes = Column(Text, nullable=True) +# +# +# # Timestamps +# created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) +# updated_at = Column(TIMESTAMP, nullable=False, server_default=func.now(), onupdate=func.now()) +# +# # Relationships +# url = relationship("URL", back_populates="url_metadata") +# annotations = relationship("MetadataAnnotation", back_populates="url_metadata") + +# class MetadataAnnotation(Base): +# __tablename__ = 'metadata_annotations' +# __table_args__ = (UniqueConstraint( +# "user_id", +# "metadata_id", +# name="metadata_annotations_uq_user_id_metadata_id"), +# ) +# +# id = Column(Integer, primary_key=True, autoincrement=True) +# user_id = 
Column(Integer, nullable=False) +# metadata_id = Column(Integer, ForeignKey('url_metadata.id'), nullable=False) +# value = Column(Text, nullable=False) +# created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) +# +# # Relationships +# url_metadata = relationship("URLMetadata", back_populates="annotations") class RootURL(Base): __tablename__ = 'root_url_cache' @@ -172,7 +186,7 @@ class RootURL(Base): url = Column(String, nullable=False) page_title = Column(String, nullable=False) page_description = Column(String, nullable=True) - updated_at = Column(TIMESTAMP, nullable=False, server_default=func.now(), onupdate=func.now()) + updated_at = get_updated_at_column() class URLErrorInfo(Base): @@ -186,7 +200,7 @@ class URLErrorInfo(Base): id = Column(Integer, primary_key=True) url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) error = Column(Text, nullable=False) - updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + updated_at = get_updated_at_column() task_id = Column(Integer, ForeignKey('tasks.id'), nullable=False) # Relationships @@ -208,7 +222,7 @@ class URLHTMLContent(Base): nullable=False) content = Column(Text, nullable=False) - updated_at = Column(TIMESTAMP, nullable=False, server_default=func.now(), onupdate=func.now()) + updated_at = get_updated_at_column() # Relationships url = relationship("URL", back_populates="html_content") @@ -245,7 +259,7 @@ class Log(Base): id = Column(Integer, primary_key=True) batch_id = Column(Integer, ForeignKey('batches.id'), nullable=False) log = Column(Text, nullable=False) - created_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + created_at = get_created_at_column() # Relationships batch = relationship("Batch", back_populates="logs") @@ -258,7 +272,7 @@ class Missing(Base): record_type = Column(String, nullable=False) batch_id = Column(Integer, ForeignKey('batches.id')) strategy_used = Column(Text, nullable=False) - date_searched = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + date_searched = get_created_at_column() # Relationships batch = relationship("Batch", back_populates="missings") @@ -276,7 +290,7 @@ class Task(Base): name='task_type' ), nullable=False) task_status = Column(batch_status_enum, nullable=False) - updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + updated_at = get_updated_at_column() # Relationships urls = relationship( @@ -306,7 +320,7 @@ class TaskError(Base): id = Column(Integer, primary_key=True) task_id = Column(Integer, ForeignKey('tasks.id', ondelete="CASCADE"), nullable=False) error = Column(Text, nullable=False) - updated_at = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) + updated_at = get_updated_at_column() # Relationships task = relationship("Task", back_populates="error") @@ -325,27 +339,27 @@ class Agency(Base): state = Column(String, nullable=True) county = Column(String, nullable=True) locality = Column(String, nullable=True) - updated_at = Column(DateTime, nullable=False, default=func.now()) + updated_at = get_updated_at_column() # Relationships - confirmed_urls = relationship("ConfirmedUrlAgency", back_populates="agency") + urls = relationship("URL", back_populates="agency") automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") -class ConfirmedUrlAgency(Base): - __tablename__ = 
"confirmed_url_agency" - - id = Column(Integer, primary_key=True, autoincrement=True) - agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False) - url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) - - agency = relationship("Agency", back_populates="confirmed_urls") - url = relationship("URL", back_populates="confirmed_agencies") - - __table_args__ = ( - UniqueConstraint("agency_id", "url_id", name="uq_confirmed_url_agency"), - ) +# class ConfirmedUrlAgency(Base): +# __tablename__ = "confirmed_url_agency" +# +# id = Column(Integer, primary_key=True, autoincrement=True) +# agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False) +# url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) +# +# agency = relationship("Agency", back_populates="confirmed_urls") +# url = relationship("URL", back_populates="confirmed_agencies") +# +# __table_args__ = ( +# UniqueConstraint("url_id", name="uq_confirmed_url_agency"), +# ) class AutomatedUrlAgencySuggestion(Base): @@ -378,4 +392,76 @@ class UserUrlAgencySuggestion(Base): __table_args__ = ( UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), - ) \ No newline at end of file + ) + +class AutoRelevantSuggestion(Base): + __tablename__ = "auto_relevant_suggestions" + + id = Column(Integer, primary_key=True, autoincrement=True) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + relevant = Column(Boolean, nullable=True) + created_at = get_created_at_column() + updated_at = get_updated_at_column() + + __table_args__ = ( + UniqueConstraint("url_id", name="auto_relevant_suggestions_uq_url_id"), + ) + + # Relationships + + url = relationship("URL", back_populates="auto_relevant_suggestions") + + +class AutoRecordTypeSuggestion(Base): + __tablename__ = "auto_record_type_suggestions" + + id = Column(Integer, primary_key=True, autoincrement=True) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) + created_at = get_created_at_column() + updated_at = get_updated_at_column() + + __table_args__ = ( + UniqueConstraint("url_id", name="auto_record_type_suggestions_uq_url_id"), + ) + + # Relationships + + url = relationship("URL", back_populates="auto_record_type_suggestions") + +class UserRelevantSuggestion(Base): + __tablename__ = "user_relevant_suggestions" + + id = Column(Integer, primary_key=True, autoincrement=True) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + user_id = Column(Integer, nullable=False) + relevant = Column(Boolean, nullable=False) + created_at = get_created_at_column() + updated_at = get_updated_at_column() + + __table_args__ = ( + UniqueConstraint("url_id", "user_id", name="uq_user_relevant_suggestions"), + ) + + # Relationships + + url = relationship("URL", back_populates="user_relevant_suggestions") + + +class UserRecordTypeSuggestion(Base): + __tablename__ = "user_record_type_suggestions" + + id = Column(Integer, primary_key=True, autoincrement=True) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + user_id = Column(Integer, nullable=False) + record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) + created_at = get_created_at_column() + updated_at = get_updated_at_column() + + __table_args__ = ( + UniqueConstraint("url_id", "user_id", name="uq_user_record_type_suggestions"), + ) + + # Relationships + + url = relationship("URL", 
back_populates="user_record_type_suggestions") \ No newline at end of file diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 4e6794d8..39c1fc46 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -174,6 +174,21 @@ async def submit_and_get_next_url_for_annotation( ) return result + async def submit_url_relevance_annotation( + self, + user_id: int, + url_id: int, + relevant: bool + ): + return await self.adb_client.add_user_relevant_suggestion( + user_id=user_id, + url_id=url_id, + relevant=relevant + ) + + async def get_next_url_for_relevance_annotation(self, user_id: int) -> GetNextURLForAnnotationResponse: + return await self.adb_client.get_next_url_for_relevance_annotation(user_id=user_id) + async def submit_url_annotation( self, user_id: int, @@ -231,8 +246,8 @@ async def approve_and_get_next_source_for_review( ): await self.adb_client.approve_url( url_id=approval_info.url_id, - is_approved=approval_info.is_approved, record_type=approval_info.record_type, - relevant=approval_info.relevant + relevant=approval_info.relevant, + agency_id=approval_info.agency_id ) return await self.get_next_source_for_review() diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 16f73602..b33051b5 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -79,6 +79,13 @@ async def agency(self) -> int: ) return agency_id + async def auto_relevant_suggestions(self, url_id: int, relevant: bool = True): + await self.adb_client.add_auto_relevant_suggestion( + url_id=url_id, + relevant=relevant + ) + + async def auto_suggestions( self, url_ids: list[int], @@ -264,16 +271,21 @@ async def agency_confirmed_suggestion( self, url_id: int ): - + """ + Creates a confirmed agency suggestion + and returns the auto-generated pdap_agency_id + """ + agency_id = await self.agency() await self.adb_client.add_confirmed_agency_url_links( suggestions=[ URLAgencySuggestionInfo( url_id=url_id, suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=await self.agency() + pdap_agency_id=agency_id ) ] ) + return agency_id async def agency_user_suggestions( self, diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 57f4c412..b45fb14a 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -8,6 +8,12 @@ async def setup_for_get_next_url_for_final_review( annotation_count: int, include_user_annotations: bool = True ): + """ + Sets up the database to test the final_review functions + Auto-labels the URL with 'relevant=True' and 'record_type=ARREST_RECORDS' + And applies auto-generated user annotations + """ + batch_id = db_data_creator.batch() url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] await db_data_creator.html_data([url_mapping.url_id]) diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index ef3693c2..fe5d4c28 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -90,6 +90,42 @@ async def run_annotation_test( @pytest.mark.asyncio async def test_annotate_relevancy(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value 
`True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + # Call `GET` `/annotate/url` and receive next URL + request_info_1: GetNextURLForAnnotationResponse = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + # Validate presence of HTML data in `html` field + assert inner_info_1.html_info.description != "" + assert inner_info_1.html_info.title != "" + assert inner_info_1.suggested_value == "True" + + + + await run_annotation_test( api_test_helper=api_test_helper, submit_and_get_next_function=api_test_helper.request_validator.post_relevance_annotation_and_get_next, diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 833e157a..8d970d48 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -9,6 +9,7 @@ from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource +from collector_db.models import ConfirmedUrlAgency, MetadataAnnotation, URL from collector_manager.enums import URLStatus from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator @@ -370,12 +371,51 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): ) # Add confirmed agency - await db_data_creator.agency_confirmed_suggestion( + agency_id = await db_data_creator.agency_confirmed_suggestion( url_id=url_mapping.url_id ) + adb_client = db_data_creator.adb_client # Approve URL. Only URL should be affected. No other properties should be changed. 
- await db_data_creator.adb_client.approve_url(url_mapping.url_id) + await adb_client.approve_url(url_mapping.url_id) + # Confirm same agency id is listed as confirmed + confirmed_agencies = await adb_client.get_all( + ConfirmedUrlAgency + ) + assert len(confirmed_agencies) == 1 + confirmed_agency = confirmed_agencies[0] + assert confirmed_agency.url_id == url_mapping.url_id + assert confirmed_agency.agency_id == agency_id + + # Confirm two metadata entries + metadatas = await adb_client.get_all( + MetadataAnnotation + ) + assert len(metadatas) == 2 + record_type_metadata = None + relevant_metadata = None + for metadata in metadatas: + if metadata.attribute == URLMetadataAttributeType.RECORD_TYPE.value: + record_type_metadata = metadata + elif metadata.attribute == URLMetadataAttributeType.RELEVANT.value: + relevant_metadata = metadata + + # - One is Record Type, with record type as ARREST_RECORDS and set as approved + + assert record_type_metadata.value == RecordType.ARREST_RECORDS.value + assert record_type_metadata.validation_status == ValidationStatus.VALIDATED.value + # - One is Relevant, and is set as TRUE and approved + + assert relevant_metadata.value == "True" + assert relevant_metadata.validation_status == ValidationStatus.VALIDATED.value + + # Confirm URL + urls = await adb_client.get_all( + URL + ) + assert len(urls) == 1 + url = urls[0] + assert url.status == URLStatus.APPROVED From a3598e982d6dbd6e5729ac27a13624c212b6a06a Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 24 Feb 2025 13:59:02 -0500 Subject: [PATCH 057/182] DRAFT --- api/routes/annotate.py | 9 +- collector_db/AsyncDatabaseClient.py | 155 ++++++++++++------ collector_db/StatementComposer.py | 2 +- core/AsyncCore.py | 28 +++- .../GetNextRelevanceAnnotationResponseInfo.py | 22 +++ core/DTOs/ResponseURLInfo.py | 6 + .../api/helpers/RequestValidator.py | 21 +-- .../integration/api/test_annotate.py | 90 ++++++++-- 8 files changed, 248 insertions(+), 85 deletions(-) create mode 100644 core/DTOs/GetNextRelevanceAnnotationResponseInfo.py create mode 100644 core/DTOs/ResponseURLInfo.py diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 1c33a978..d5f2e709 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -3,6 +3,8 @@ from api.dependencies import get_async_core from collector_db.enums import URLMetadataAttributeType from core.AsyncCore import AsyncCore +from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo, \ + GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse @@ -21,10 +23,9 @@ async def get_next_url_for_relevance_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAnnotationResponse: - result = await async_core.get_next_url_for_annotation( +) -> GetNextRelevanceAnnotationResponseOuterInfo: + result = await async_core.get_next_url_for_relevance_annotation( user_id=access_info.user_id, - metadata_type=URLMetadataAttributeType.RELEVANT ) return result @@ -35,7 +36,7 @@ async def annotate_url_for_relevance_and_get_next_url( url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info) -) -> GetNextURLForAnnotationResponse: +) -> 
GetNextRelevanceAnnotationResponseOuterInfo:
     """
     Post URL annotation and get next URL to annotate
     """
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index 3e7eee14..3d34f024 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -2,9 +2,9 @@
 from typing import Optional, List
 
 from fastapi import HTTPException
-from sqlalchemy import select, exists, func, distinct, case, desc, asc, Select, not_
+from sqlalchemy import select, exists, func, distinct, case, desc, asc, Select, not_, and_
 from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
-from sqlalchemy.orm import selectinload, aliased, joinedload
+from sqlalchemy.orm import selectinload, aliased, joinedload, Mapped, QueryableAttribute
 from starlette import status
 
 from collector_db.ConfigManager import ConfigManager
@@ -15,6 +15,7 @@
 from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
 from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo
 from collector_db.DTOs.URLInfo import URLInfo
+from collector_db.DTOs.URLMapping import URLMapping
 from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo
 from collector_db.DTOs.URLWithHTML import URLWithHTML
 from collector_db.StatementComposer import StatementComposer
@@ -25,6 +26,7 @@
     UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \
     UserRecordTypeSuggestion
 from collector_manager.enums import URLStatus, CollectorType
+from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo
 from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \
     GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo
 from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo, \
@@ -33,16 +35,23 @@
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \
     GetURLsResponseInnerInfo
 from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo
+from core.DTOs.ResponseURLInfo import ResponseURLInfo
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
 from core.enums import BatchStatus, SuggestionType, RecordType
 from html_tag_collector.DataClassTags import convert_to_response_html_info
 
+# Type Hints
+
+UserSuggestionModel = UserRelevantSuggestion | UserRecordTypeSuggestion | UserUrlAgencySuggestion
+AutoSuggestionModel = AutoRelevantSuggestion | AutoRecordTypeSuggestion | AutomatedUrlAgencySuggestion
+
 def add_standard_limit_and_offset(statement, page, limit=100):
     offset = (page - 1) * limit
     return statement.limit(limit).offset(offset)
 
+
 class AsyncDatabaseClient:
     def __init__(self, db_url: str = get_postgres_connection_string(is_async=True)):
         self.engine = create_async_engine(
@@ -59,11 +68,10 @@ async def _add_models(session: AsyncSession, model_class, models) -> list[int]:
             await session.flush()
             return [instance.id for instance in instances]
 
-
-
     @staticmethod
     def session_manager(method):
         """Decorator to manage async session lifecycle."""
+
         @wraps(method)
         async def wrapper(self, *args, **kwargs):
             async with self.session_maker() as session:
@@ -74,6 +82,7 @@ async def wrapper(self, *args, **kwargs):
             except Exception as e:
                 await session.rollback()
                 raise e
+
         return wrapper
@session_manager @@ -107,27 +116,29 @@ async def add_auto_relevant_suggestion( ) session.add(suggestion) - @session_manager - async def add_user_relevant_suggestion( - self, + @staticmethod + async def get_user_suggestion( session: AsyncSession, - url_id: int, + model: UserSuggestionModel, user_id: int, - relevant: bool - ): - suggestion = UserRelevantSuggestion( - url_id=url_id, - user_id=user_id, - relevant=relevant + url_id: int + ) -> Optional[UserSuggestionModel]: + statement = Select(model).where( + and_( + model.url_id == url_id, + model.user_id == user_id + ) ) - session.add(suggestion) + result = await session.execute(statement) + return result.unique().scalar_one_or_none() - @session_manager - async def get_next_url_for_relevance_annotation( - self, + @staticmethod + async def get_next_url_for_user_annotation( session: AsyncSession, + user_suggestion_model_to_exclude: UserSuggestionModel, + auto_suggestion_relationship: QueryableAttribute, user_id: int - ): + ) -> URL: url_query = ( select( URL, @@ -139,16 +150,16 @@ async def get_next_url_for_relevance_annotation( .where( not_( exists( - select(UserRelevantSuggestion) + select(user_suggestion_model_to_exclude) .where( - UserRelevantSuggestion.url_id == URL.id, - UserRelevantSuggestion.user_id == user_id + user_suggestion_model_to_exclude.url_id == URL.id, + user_suggestion_model_to_exclude.user_id == user_id ) ) ) - # TODO: Parameterize relationship attribute to joinedload + # TODO: Parameterize relationship attribute to joinedload ).options( - joinedload(URL.auto_relevant_suggestions), + joinedload(auto_suggestion_relationship), joinedload(URL.html_content) ). limit(1) @@ -156,7 +167,46 @@ async def get_next_url_for_relevance_annotation( raw_result = await session.execute(url_query) - url: URL = raw_result.scalars().one_or_none() + return raw_result.unique().scalars().one_or_none() + + @session_manager + async def add_user_relevant_suggestion( + self, + session: AsyncSession, + url_id: int, + user_id: int, + relevant: bool + ): + prior_suggestion = await self.get_user_suggestion( + session, + model=UserRelevantSuggestion, + user_id=user_id, + url_id=url_id + ) + if prior_suggestion is not None: + prior_suggestion.relevant = relevant + return + + suggestion = UserRelevantSuggestion( + url_id=url_id, + user_id=user_id, + relevant=relevant + ) + session.add(suggestion) + + @session_manager + async def get_next_url_for_relevance_annotation( + self, + session: AsyncSession, + user_id: int + ) -> Optional[GetNextRelevanceAnnotationResponseInfo]: + + url = await self.get_next_url_for_user_annotation( + session, + user_suggestion_model_to_exclude=UserRelevantSuggestion, + auto_suggestion_relationship=URL.auto_relevant_suggestions, + user_id=user_id + ) if url is None: return None @@ -171,15 +221,27 @@ async def get_next_url_for_relevance_annotation( else: auto_suggestion = None - return RelevanceAnnotationResponseInfo( - url_id=url.id, + return GetNextRelevanceAnnotationResponseInfo( + url_info=URLMapping( + url=url.url, + url_id=url.id + ), suggested_relevant=auto_suggestion, - html_response_info=html_response_info + html_info=html_response_info ) + #endregion relevant + + #region record_type + + @session_manager + async def get_next_url_for_record_type_annotation( + self, + session: AsyncSession, + user_id: int + ) = : - #endregion relevant @session_manager async def add_auto_record_type_suggestion( @@ -194,7 +256,6 @@ async def add_auto_record_type_suggestion( ) session.add(suggestion) - @session_manager async def 
add_user_record_type_suggestion( self, @@ -210,6 +271,7 @@ async def add_user_record_type_suggestion( ) session.add(suggestion) + #endregion record_type @session_manager async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo) -> int: @@ -304,14 +366,13 @@ async def get_urls_with_html_data_and_without_metadata_type( ) final_results.append(url_with_html) - return final_results @session_manager async def has_pending_urls_with_html_data_and_without_metadata_type( - self, - session: AsyncSession, - without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT + self, + session: AsyncSession, + without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT ) -> bool: # TODO: Generalize this so that it can exclude based on other attributes # Get URLs with no relevancy metadata @@ -357,7 +418,8 @@ async def get_urls_with_metadata( return final_results @session_manager - async def update_url_metadata_status(self, session: AsyncSession, metadata_ids: list[int], validation_status: ValidationStatus): + async def update_url_metadata_status(self, session: AsyncSession, metadata_ids: list[int], + validation_status: ValidationStatus): for metadata_id in metadata_ids: statement = select(URLMetadata).where(URLMetadata.id == metadata_id) scalar_result = await session.scalars(statement) @@ -631,8 +693,6 @@ async def get_html_content_info(self, session: AsyncSession, url_id: int) -> lis results = session_result.scalars().all() return [URLHTMLContentInfo(**result.__dict__) for result in results] - - @session_manager async def link_urls_to_task(self, session: AsyncSession, task_id: int, url_ids: list[int]): for url_id in url_ids: @@ -932,7 +992,6 @@ async def get_next_url_for_final_review( group_by(URLMetadata.url_id).subquery() ) - # Count whether agency auto annotations exist # (Note: Can be either confirmed or auto suggestion) agency_annotations_exist_subquery = ( @@ -990,23 +1049,21 @@ async def get_next_url_for_final_review( ).group_by(UserUrlAgencySuggestion.url_id).subquery() ) - - # Basic URL query url_query = ( select( URL, ( - func.coalesce(distinct_auto_metadata_subquery.c.auto_count, 0) + - func.coalesce(distinct_user_metadata_subquery.c.user_count, 0) + - func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + - func.coalesce(agency_user_annotations_exist_subquery.c.agency_user_annotations_exist, 0) + func.coalesce(distinct_auto_metadata_subquery.c.auto_count, 0) + + func.coalesce(distinct_user_metadata_subquery.c.user_count, 0) + + func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + + func.coalesce(agency_user_annotations_exist_subquery.c.agency_user_annotations_exist, 0) ).label("total_distinct_annotation_count"), ( - func.coalesce(all_auto_metadata_subquery.c.auto_count, 0) + - func.coalesce(all_user_metadata_subquery.c.user_count, 0) + - func.coalesce(all_user_agency_annotations_subquery.c.user_count, 0) + - func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + func.coalesce(all_auto_metadata_subquery.c.auto_count, 0) + + func.coalesce(all_user_metadata_subquery.c.user_count, 0) + + func.coalesce(all_user_agency_annotations_subquery.c.user_count, 0) + + func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) ).label("total_overall_annotation_count") ).outerjoin( distinct_auto_metadata_subquery, URL.id == distinct_auto_metadata_subquery.c.url_id @@ -1043,7 +1100,6 @@ async def get_next_url_for_final_review( 
desc("total_overall_annotation_count"), ) - # Apply limit url_query = url_query.limit(1) @@ -1179,4 +1235,3 @@ async def set_approved_metadata( # If it does, do nothing url.outcome = URLStatus.APPROVED.value - diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 6b6ab677..b3ab5312 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -74,7 +74,7 @@ def exclude_urls_with_agency_suggestions( return statement @staticmethod - def get_all_html_content_for_url(subquery) -> Select: + async def get_all_html_content_for_url(subquery) -> Select: statement = ( select( subquery.c.url, diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 39c1fc46..a4cea685 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -8,6 +8,7 @@ from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo from collector_db.enums import TaskType, URLMetadataAttributeType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse @@ -20,7 +21,7 @@ from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator -from core.enums import BatchStatus, SuggestionType +from core.enums import BatchStatus, SuggestionType, RecordType from html_tag_collector.DataClassTags import convert_to_response_html_info from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface @@ -186,8 +187,29 @@ async def submit_url_relevance_annotation( relevant=relevant ) - async def get_next_url_for_relevance_annotation(self, user_id: int) -> GetNextURLForAnnotationResponse: - return await self.adb_client.get_next_url_for_relevance_annotation(user_id=user_id) + async def get_next_url_for_relevance_annotation(self, user_id: int) -> GetNextRelevanceAnnotationResponseOuterInfo: + next_annotation = await self.adb_client.get_next_url_for_relevance_annotation(user_id=user_id) + return GetNextRelevanceAnnotationResponseOuterInfo( + next_annotation=next_annotation + ) + + async def get_next_url_for_record_type_annotation(self, user_id: int) -> GetNextRecordTypeAnnotationResponseOuterInfo: + next_annotation = await self.adb_client.get_next_url_for_record_type_annotation(user_id=user_id) + return GetNextRecordTypeAnnotationResponseOuterInfo( + next_annotation=next_annotation + ) + + async def submit_url_record_type_annotation( + self, + user_id: int, + url_id: int, + record_type: RecordType + ): + return await self.adb_client.add_user_record_type_suggestion( + user_id=user_id, + url_id=url_id, + record_type=record_type + ) async def submit_url_annotation( self, diff --git a/core/DTOs/GetNextRelevanceAnnotationResponseInfo.py b/core/DTOs/GetNextRelevanceAnnotationResponseInfo.py new file mode 100644 index 00000000..61cb35a5 --- /dev/null +++ b/core/DTOs/GetNextRelevanceAnnotationResponseInfo.py @@ -0,0 +1,22 @@ +from typing import Optional + +from pydantic import BaseModel, Field + +from collector_db.DTOs.URLMapping import URLMapping +from core.DTOs.ResponseURLInfo import ResponseURLInfo +from 
html_tag_collector.DataClassTags import ResponseHTMLInfo + + +class GetNextRelevanceAnnotationResponseInfo(BaseModel): + url_info: URLMapping = Field( + title="Information about the URL" + ) + suggested_relevant: Optional[bool] = Field( + title="Whether the auto-labeler identified the URL as relevant or not" + ) + html_info: ResponseHTMLInfo = Field( + title="HTML information about the URL" + ) + +class GetNextRelevanceAnnotationResponseOuterInfo(BaseModel): + next_annotation: Optional[GetNextRelevanceAnnotationResponseInfo] diff --git a/core/DTOs/ResponseURLInfo.py b/core/DTOs/ResponseURLInfo.py new file mode 100644 index 00000000..c7f7e364 --- /dev/null +++ b/core/DTOs/ResponseURLInfo.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class ResponseURLInfo(BaseModel): + url: str + url_id: int \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index d25ca424..8a7cd487 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -12,6 +12,8 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse +from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo, \ + GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse @@ -172,18 +174,11 @@ def abort_batch(self, batch_id: int) -> MessageResponse: ) return MessageResponse(**data) - def process_relevancy(self) -> MessageCountResponse: - # TODO: Delete - data = self.post( - url=f"process/relevancy" - ) - return MessageCountResponse(**data) - - def get_next_relevance_annotation(self) -> GetNextURLForAnnotationResponse: + def get_next_relevance_annotation(self) -> GetNextRelevanceAnnotationResponseOuterInfo: data = self.get( url=f"/annotate/relevance" ) - return GetNextURLForAnnotationResponse(**data) + return GetNextRelevanceAnnotationResponseOuterInfo(**data) def get_next_record_type_annotation(self) -> GetNextURLForAnnotationResponse: data = self.get( @@ -204,14 +199,14 @@ def post_record_type_annotation_and_get_next( def post_relevance_annotation_and_get_next( self, - metadata_id: int, + url_id: int, relevance_annotation_post_info: RelevanceAnnotationPostInfo - ) -> GetNextURLForAnnotationResponse: + ) -> GetNextRelevanceAnnotationResponseOuterInfo: data = self.post( - url=f"/annotate/relevance/{metadata_id}", + url=f"/annotate/relevance/{url_id}", json=relevance_annotation_post_info.model_dump(mode='json') ) - return GetNextURLForAnnotationResponse(**data) + return GetNextRelevanceAnnotationResponseOuterInfo(**data) async def get_next_agency_annotation(self) -> GetNextURLForAgencyAnnotationResponse: data = self.get( diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index fe5d4c28..b90ad1cc 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -3,16 +3,32 @@ import pytest from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo +from collector_db.DTOs.URLMapping import URLMapping from 
collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource
-from collector_db.models import UserUrlAgencySuggestion
+from collector_db.models import UserUrlAgencySuggestion, UserRelevantSuggestion
+from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo
 from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo
 from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse
 from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo
 from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo
 from core.enums import RecordType, SuggestionType
+from html_tag_collector.DataClassTags import ResponseHTMLInfo
 from tests.helpers.DBDataCreator import BatchURLCreationInfo
 from tests.test_automated.integration.api.conftest import MOCK_USER_ID
 
+def check_url_mappings_match(
+    map_1: URLMapping,
+    map_2: URLMapping
+):
+    assert map_1.url_id == map_2.url_id
+    assert map_1.url == map_2.url
+
+def check_html_info_not_empty(
+    html_info: ResponseHTMLInfo
+):
+    assert html_info.description != ""
+    assert html_info.title != ""
+
 async def run_annotation_test(
     api_test_helper,
     submit_and_get_next_function: callable,
@@ -57,6 +73,8 @@ async def run_annotation_test(
     # Validate presence of HTML data in `html` field
     assert inner_info_1.html_info.description != ""
     assert inner_info_1.html_info.title != ""
+
+    # Validate that the correct metadata value is returned
     assert inner_info_1.suggested_value == "False"
 
     # Call `POST` `/annotate/url` with finished annotation, and receive next URL
@@ -115,28 +133,72 @@ async def test_annotate_relevancy(api_test_helper):
     # Add HTML data to both
     await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id])
     # Call `GET` `/annotate/url` and receive next URL
-    request_info_1: GetNextURLForAnnotationResponse = api_test_helper.request_validator.get_next_relevance_annotation()
+    request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation()
     inner_info_1 = request_info_1.next_annotation
 
-    # Validate presence of HTML data in `html` field
-    assert inner_info_1.html_info.description != ""
-    assert inner_info_1.html_info.title != ""
-    assert inner_info_1.suggested_value == "True"
+    check_url_mappings_match(inner_info_1.url_info, url_1)
+    check_html_info_not_empty(inner_info_1.html_info)
 
+    # Validate that the correct relevant value is returned
+    assert inner_info_1.suggested_relevant is True
 
+    # Annotate with value 'False' and get next URL
+    request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next(
+        url_id=inner_info_1.url_info.url_id,
+        relevance_annotation_post_info=RelevanceAnnotationPostInfo(
+            is_relevant=False
+        )
+    )
+    inner_info_2 = request_info_2.next_annotation
 
-    await run_annotation_test(
-        api_test_helper=api_test_helper,
-        submit_and_get_next_function=api_test_helper.request_validator.post_relevance_annotation_and_get_next,
-        get_next_function=api_test_helper.request_validator.get_next_relevance_annotation,
-        post_info=RelevanceAnnotationPostInfo(
+    check_url_mappings_match(
+        inner_info_2.url_info,
+        url_2
+    )
+    check_html_info_not_empty(inner_info_2.html_info)
+
+    request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next(
+        url_id=inner_info_2.url_info.url_id,
+        relevance_annotation_post_info=RelevanceAnnotationPostInfo(
+            is_relevant=True
+        )
+    )
+
+    assert request_info_3.next_annotation is None
+
+    # Get all URL annotations. Confirm they exist for user
+    adb_client = ath.adb_client()
+    results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion)
+    result_1 = results[0]
+    result_2 = results[1]
+
+    assert result_1.url_id == inner_info_1.url_info.url_id
+    assert result_1.relevant is False
+
+    assert result_2.url_id == inner_info_2.url_info.url_id
+    assert result_2.relevant is True
+
+    # If the user submits an annotation for the same URL, the existing annotation should be overwritten
+
+    request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next(
+        url_id=inner_info_1.url_info.url_id,
+        relevance_annotation_post_info=RelevanceAnnotationPostInfo(
+            is_relevant=True
+        )
+    )
+
+    assert request_info_4.next_annotation is None
+
+    results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion)
+    assert len(results) == 2
+
+    for result in results:
+        if result.url_id == inner_info_1.url_info.url_id:
+            assert result.relevant is True
+
+
+
 @pytest.mark.asyncio
 async def test_annotate_record_type(api_test_helper):
     await run_annotation_test(
From 93faa1fbe925717150fe56fce905d1f7765d2528 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Tue, 25 Feb 2025 11:56:31 -0500
Subject: [PATCH 058/182] build(api): add approve source endpoint and overhaul
 metadata

Create `/review/approve-source` endpoint

Overhaul annotation backend for better maintainability

BREAKING CHANGE: Annotations no longer return metadata ids, but url ids.
Approval or suggested annotations must now include url ids instead of
metadata ids.
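For illustration, a rough client-side sketch of the new contract (the base
URL, bearer token, and url_id value here are assumptions; the route shape
and field names follow api/routes/annotate.py and the DTOs in this patch):

    import httpx

    url_id = 123  # a URL id; the old API expected a metadata id here

    response = httpx.post(
        f"http://localhost:80/annotate/relevance/{url_id}",
        json={"is_relevant": True},
        headers={"Authorization": "Bearer <token>"},
    )
    # The response wraps the next URL to annotate;
    # "next_annotation" is null once no unannotated URLs remain.
    next_annotation = response.json()["next_annotation"]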
--- ...45b1c_add_task_tables_and_linking_logic.py | 1 - ...21086_update_metadata_validation_status.py | 16 +- ...0590bb_overhaul_annotation_organization.py | 81 ++- ...ae_add_user_url_agency_suggestions_and_.py | 6 +- ...daf0_revise_agency_identification_logic.py | 1 - api/routes/annotate.py | 15 +- collector_db/AsyncDatabaseClient.py | 663 ++++++++---------- collector_db/DTOConverter.py | 232 +++--- collector_db/StatementComposer.py | 19 +- collector_db/models.py | 77 +- collector_manager/enums.py | 6 +- core/AsyncCore.py | 63 +- core/DTOs/FinalReviewApprovalInfo.py | 4 +- ...GetNextRecordTypeAnnotationResponseInfo.py | 22 + core/DTOs/GetNextURLForFinalReviewResponse.py | 1 - core/DTOs/GetURLsResponseInfo.py | 1 - core/classes/URLRecordTypeTaskOperator.py | 27 +- .../URLRelevanceHuggingfaceTaskOperator.py | 27 +- local_database/DataDumper/dump.sh | 2 +- tests/helpers/DBDataCreator.py | 71 +- tests/helpers/complex_test_data_functions.py | 56 +- .../test_html_tag_collector_integration.py | 1 - tests/test_alembic/test_revisions.py | 7 +- .../api/helpers/RequestValidator.py | 13 +- .../integration/api/test_annotate.py | 177 ++--- .../integration/api/test_url.py | 5 - .../collector_db/test_database_structure.py | 39 -- .../collector_db/test_db_client.py | 102 +-- .../security_manager/test_security_manager.py | 5 +- .../tasks/test_agency_preannotation_task.py | 4 +- .../tasks/test_url_record_type_task.py | 10 +- .../test_url_relevancy_huggingface_task.py | 17 +- util/alembic_helpers.py | 38 + 33 files changed, 800 insertions(+), 1009 deletions(-) create mode 100644 core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py create mode 100644 util/alembic_helpers.py diff --git a/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py b/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py index b2174484..f408396f 100644 --- a/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py +++ b/alembic/versions/072b32a45b1c_add_task_tables_and_linking_logic.py @@ -72,7 +72,6 @@ def downgrade() -> None: op.drop_constraint("url_error_info_task_id_fkey", 'url_error_info', type_='foreignkey') op.drop_constraint('uq_url_id_error', 'url_error_info', type_='unique') op.drop_column('url_error_info', 'task_id') - op.drop_column('url_metadata', 'notes') op.drop_table('link_task_urls') op.drop_table('task_errors') op.drop_table('tasks') diff --git a/alembic/versions/108dac321086_update_metadata_validation_status.py b/alembic/versions/108dac321086_update_metadata_validation_status.py index 5212865a..aa05ee1b 100644 --- a/alembic/versions/108dac321086_update_metadata_validation_status.py +++ b/alembic/versions/108dac321086_update_metadata_validation_status.py @@ -43,13 +43,13 @@ def upgrade() -> None: def downgrade() -> None: validation_status.create(op.get_bind()) - - op.alter_column( - table_name="url_metadata", - column_name="validation_status", - existing_type=metadata_validation_status, - type_=validation_status, - postgresql_using="validation_status::text::validation_status" - ) + # + # op.alter_column( + # table_name="url_metadata", + # column_name="validation_status", + # existing_type=metadata_validation_status, + # type_=validation_status, + # postgresql_using="validation_status::text::validation_status" + # ) metadata_validation_status.drop(op.get_bind(), checkfirst=True) diff --git a/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py b/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py index 4b453174..55442f50 
100644 --- a/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py +++ b/alembic/versions/2025_02_23_1023-33421c0590bb_overhaul_annotation_organization.py @@ -16,6 +16,11 @@ - `ConfirmedURLAgency` - `MetadataAnnotation` +Update URL Status to just three enum value: +- VALIDATED +- SUBMITTED +- PENDING + Revision ID: 33421c0590bb Revises: 0c6dc00806ce Create Date: 2025-02-23 10:23:19.696248 @@ -27,7 +32,7 @@ import sqlalchemy as sa from sqlalchemy import UniqueConstraint -from core.enums import RecordType +from util.alembic_helpers import switch_enum_type # revision identifiers, used by Alembic. revision: str = '33421c0590bb' @@ -77,11 +82,50 @@ record_type_enum = sa.Enum(*record_type_values, name='record_type') +def run_data_migrations(): + + op.execute( + """ + INSERT INTO AUTO_RELEVANT_SUGGESTIONS (url_id, relevant) + SELECT url_id, LOWER(value)::boolean + FROM public.url_metadata + WHERE validation_source = 'Machine Learning' + and attribute = 'Relevant' + """ + ) + + op.execute( + """ + INSERT INTO AUTO_RECORD_TYPE_SUGGESTIONS(url_id, record_type) + SELECT url_id, value::record_type + FROM public.url_metadata + WHERE validation_source = 'Machine Learning' + and attribute = 'Record Type' + """ + ) + + op.execute( + """ + INSERT INTO USER_RELEVANT_SUGGESTIONS(url_id, relevant, user_id) + SELECT um.url_id, LOWER(um.value)::boolean, ma.user_id + FROM public.url_metadata um + INNER join metadata_annotations ma on um.id = ma.metadata_id + where um.attribute = 'Relevant' + """ + ) + + op.execute( + """ + INSERT INTO USER_RECORD_TYPE_SUGGESTIONS(url_id, record_type, user_id) + SELECT um.url_id, um.value::record_type, ma.user_id + FROM public.url_metadata um + INNER join metadata_annotations ma on um.id = ma.metadata_id + where um.attribute = 'Record Type' + + """ + ) + def upgrade() -> None: - # Delete the old tables - op.drop_table('metadata_annotations') - op.drop_table('url_metadata') - op.drop_table('confirmed_url_agency') # Create the new tables op.create_table( @@ -168,6 +212,21 @@ def upgrade() -> None: ) ) + run_data_migrations() + + # Delete the old tables + op.drop_table('metadata_annotations') + op.drop_table('url_metadata') + op.drop_table('confirmed_url_agency') + + switch_enum_type( + table_name='urls', + column_name='outcome', + enum_name='url_status', + new_enum_values=['pending', 'submitted', 'validated', 'error', 'duplicate'] + ) + + @@ -214,7 +273,7 @@ def downgrade() -> None: op.create_table( 'metadata_annotations', sa.Column('id', sa.Integer(), primary_key=True), - sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id', ondelete='CASCADE'), nullable=False), + sa.Column('metadata_id', sa.Integer(), sa.ForeignKey('url_metadata.id', ondelete='CASCADE'), nullable=False), sa.Column('user_id', sa.Integer(), nullable=False), sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), @@ -223,3 +282,13 @@ def downgrade() -> None: "metadata_id", name="metadata_annotations_uq_user_id_metadata_id"), ) + + switch_enum_type( + table_name='urls', + column_name='outcome', + enum_name='url_status', + new_enum_values=['pending', 'submitted', 'human_labeling', 'rejected', 'duplicate', 'error'] + ) + + # Drop enum + record_type_enum.drop(op.get_bind()) diff --git a/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py b/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py index 
8eadb6a3..87c069fa 100644 --- a/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py +++ b/alembic/versions/8c44e02733ae_add_user_url_agency_suggestions_and_.py @@ -51,13 +51,9 @@ def upgrade() -> None: def downgrade() -> None: - op.drop_column( - table_name='url_agency_suggestions', - column_name="user_id" - ) + op.execute("DROP TRIGGER IF EXISTS enforce_url_agency_suggestions_manual_suggestion_user_id ON url_agency_suggestions;") op.execute( """ - DROP TRIGGER IF EXISTS enforce_url_agency_suggestions_manual_suggestion_user_id; DROP FUNCTION IF EXISTS user_url_agency_suggestions_value(); """ ) diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py index 62d9930d..2bb7c157 100644 --- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py +++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py @@ -146,6 +146,5 @@ def downgrade(): op.execute(""" DROP FUNCTION IF EXISTS enforce_no_agency_id_if_unknown; """) - op.execute("DROP TRIGGER enforce_no_agency_id_if_new ON user_url_agency_suggestions") op.execute("DROP FUNCTION enforce_no_agency_id_if_new()") diff --git a/api/routes/annotate.py b/api/routes/annotate.py index d5f2e709..53486d7d 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -3,6 +3,7 @@ from api.dependencies import get_async_core from collector_db.enums import URLMetadataAttributeType from core.AsyncCore import AsyncCore +from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo, \ GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -53,10 +54,9 @@ async def annotate_url_for_relevance_and_get_next_url( async def get_next_url_for_record_type_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAnnotationResponse: - result = await async_core.get_next_url_for_annotation( +) -> GetNextRecordTypeAnnotationResponseOuterInfo: + result = await async_core.get_next_url_for_record_type_annotation( user_id=access_info.user_id, - metadata_type=URLMetadataAttributeType.RECORD_TYPE ) return result @@ -66,15 +66,14 @@ async def annotate_url_for_record_type_and_get_next_url( url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info) -) -> GetNextURLForAnnotationResponse: +) -> GetNextRecordTypeAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate """ - result = await async_core.submit_and_get_next_url_for_annotation( + result = await async_core.submit_url_record_type_annotation( user_id=access_info.user_id, - metadata_id=metadata_id, - annotation=record_type_annotation_post_info.record_type.value, - metadata_type=URLMetadataAttributeType.RECORD_TYPE + url_id=url_id, + record_type=record_type_annotation_post_info.record_type, ) return result diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 3d34f024..cf7f36cb 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,41 +1,36 @@ from functools import wraps -from typing import Optional, List +from typing import Optional, Type from fastapi import HTTPException -from 
sqlalchemy import select, exists, func, distinct, case, desc, asc, Select, not_, and_ +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_ from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker -from sqlalchemy.orm import selectinload, aliased, joinedload, Mapped, QueryableAttribute +from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute from starlette import status from collector_db.ConfigManager import ConfigManager from collector_db.DTOConverter import DTOConverter -from collector_db.DTOs.MetadataAnnotationInfo import MetadataAnnotationInfo from collector_db.DTOs.TaskInfo import TaskInfo -from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMapping import URLMapping -from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.StatementComposer import StatementComposer -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType +from collector_db.enums import URLMetadataAttributeType, TaskType from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ UserRecordTypeSuggestion from collector_manager.enums import URLStatus, CollectorType +from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ - GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse, URLAgencyAnnotationPostInfo -from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo, \ - FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyInfo + GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo -from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ +from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo -from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo -from core.DTOs.ResponseURLInfo import ResponseURLInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.enums import BatchStatus, SuggestionType, RecordType @@ -85,23 +80,6 @@ async def wrapper(self, *args, **kwargs): return wrapper - @session_manager - async def get_url_metadata_by_status( - self, - session: AsyncSession, - url_status: URLStatus, - offset: int = 0 - ): - statement = (select(URLMetadata) - .join(URL) - .where(URL.outcome == url_status.value) - 
.limit(100)
-                .offset(offset)
-                .order_by(URLMetadata.id))
-        scalar_result = await session.scalars(statement)
-        model_result = scalar_result.all()
-        return [URLMetadataInfo(**url_metadata.__dict__) for url_metadata in model_result]
-
     # region relevant
     @session_manager
     async def add_auto_relevant_suggestion(
@@ -143,10 +121,8 @@ async def get_next_url_for_user_annotation(
             select(
                 URL,
             )
-            # TODO: Generalize this whole section
             .where(exists(select(URLHTMLContent).where(URLHTMLContent.url_id == URL.id)))
             # URL must not have metadata annotation by this user
-            # TODO: Have this as a parameter for the user model
             .where(
                 not_(
                     exists(
@@ -157,7 +133,6 @@
                     )
                 )
             )
-            # TODO: Parameterize relationship attribute to joinedload
         ).options(
             joinedload(auto_suggestion_relationship),
             joinedload(URL.html_content)
@@ -204,7 +179,7 @@ async def get_next_url_for_relevance_annotation(
         url = await self.get_next_url_for_user_annotation(
             session,
             user_suggestion_model_to_exclude=UserRelevantSuggestion,
-            auto_suggestion_relationship=URL.auto_relevant_suggestions,
+            auto_suggestion_relationship=URL.auto_relevant_suggestion,
             user_id=user_id
         )
         if url is None:
@@ -215,18 +190,17 @@
             url.html_content
         )
 
-        # Get auto-suggestion if exists
-        if len(url.auto_relevant_suggestions) > 0:
-            auto_suggestion = url.auto_relevant_suggestions[0].relevant
+        if url.auto_relevant_suggestion is not None:
+            suggestion = url.auto_relevant_suggestion.relevant
         else:
-            auto_suggestion = None
+            suggestion = None
 
         return GetNextRelevanceAnnotationResponseInfo(
             url_info=URLMapping(
                 url=url.url,
                 url_id=url.id
             ),
-            suggested_relevant=auto_suggestion,
+            suggested_relevant=suggestion,
             html_info=html_response_info
         )
 
@@ -239,9 +213,49 @@
     async def get_next_url_for_record_type_annotation(
             self,
             session: AsyncSession,
             user_id: int
-    ):
+    ) -> Optional[GetNextRecordTypeAnnotationResponseInfo]:
+
+        url = await self.get_next_url_for_user_annotation(
+            session,
+            user_suggestion_model_to_exclude=UserRecordTypeSuggestion,
+            auto_suggestion_relationship=URL.auto_record_type_suggestion,
+            user_id=user_id
+        )
+        if url is None:
+            return None
+
+        # Next, get all HTML content for the URL
+        html_response_info = DTOConverter.html_content_list_to_html_response_info(
+            url.html_content
+        )
+
+        if url.auto_record_type_suggestion is not None:
+            suggestion = url.auto_record_type_suggestion.record_type
+        else:
+            suggestion = None
+
+        return GetNextRecordTypeAnnotationResponseInfo(
+            url_info=URLMapping(
+                url=url.url,
+                url_id=url.id
+            ),
+            suggested_record_type=suggestion,
+            html_info=html_response_info
+        )
+
+    @session_manager
+    async def add_auto_record_type_suggestions(
+        self,
+        session: AsyncSession,
+        url_and_record_type_list: list[tuple[int, RecordType]]
+    ):
+        for url_id, record_type in url_and_record_type_list:
+            suggestion = AutoRecordTypeSuggestion(
+                url_id=url_id,
+                record_type=record_type.value
+            )
+            session.add(suggestion)
 
     @session_manager
     async def add_auto_record_type_suggestion(
@@ -250,12 +264,26 @@
             self,
             url_id: int,
             record_type: RecordType
     ):
+
         suggestion = AutoRecordTypeSuggestion(
             url_id=url_id,
             record_type=record_type.value
         )
         session.add(suggestion)
 
+    @session_manager
+    async def add_auto_relevance_suggestions(
+        self,
+        session: AsyncSession,
+        url_and_relevance_type_list: list[tuple[int, bool]]
+    ):
+        for url_id, relevant in url_and_relevance_type_list:
+            suggestion = AutoRelevantSuggestion(
+                url_id=url_id,
+
relevant=relevant + ) + session.add(suggestion) + @session_manager async def add_user_record_type_suggestion( self, @@ -264,6 +292,16 @@ async def add_user_record_type_suggestion( user_id: int, record_type: RecordType ): + prior_suggestion = await self.get_user_suggestion( + session, + model=UserRecordTypeSuggestion, + user_id=user_id, + url_id=url_id + ) + if prior_suggestion is not None: + prior_suggestion.record_type = record_type.value + return + suggestion = UserRecordTypeSuggestion( url_id=url_id, user_id=user_id, @@ -273,15 +311,6 @@ async def add_user_record_type_suggestion( #endregion record_type - @session_manager - async def add_url_metadata(self, session: AsyncSession, url_metadata_info: URLMetadataInfo) -> int: - result = await self._add_models(session, URLMetadata, [url_metadata_info]) - return result[0] - - @session_manager - async def add_url_metadatas(self, session: AsyncSession, url_metadata_infos: list[URLMetadataInfo]) -> list[int]: - return await self._add_models(session, URLMetadata, url_metadata_infos) - @session_manager async def add_url_error_infos(self, session: AsyncSession, url_error_infos: list[URLErrorPydanticInfo]): for url_error_info in url_error_infos: @@ -331,6 +360,79 @@ async def get_pending_urls_without_html_data(self, session: AsyncSession): scalar_result = await session.scalars(statement) return scalar_result.all() + async def get_urls_with_html_data_and_without_models( + self, + session: AsyncSession, + model: Type[Base] + ): + statement = (select(URL) + .options(selectinload(URL.html_content)) + .where(URL.outcome == URLStatus.PENDING.value)) + statement = self.statement_composer.exclude_urls_with_extant_model( + statement=statement, + model=model + ) + statement = statement.limit(100).order_by(URL.id) + raw_result = await session.execute(statement) + urls: list[URL] = raw_result.unique().scalars().all() + final_results = DTOConverter.url_list_to_url_with_html_list(urls) + + return final_results + + @session_manager + async def get_urls_with_html_data_and_without_auto_record_type_suggestion( + self, + session: AsyncSession + ): + return await self.get_urls_with_html_data_and_without_models( + session=session, + model=AutoRecordTypeSuggestion + ) + + @session_manager + async def get_urls_with_html_data_and_without_auto_relevant_suggestion( + self, + session: AsyncSession + ): + return await self.get_urls_with_html_data_and_without_models( + session=session, + model=AutoRelevantSuggestion + ) + + async def has_urls_with_html_data_and_without_models( + self, + session: AsyncSession, + model: Type[Base] + ) -> bool: + statement = (select(URL) + .join(URLHTMLContent) + .where(URL.outcome == URLStatus.PENDING.value)) + # Exclude URLs with auto suggested record types + statement = self.statement_composer.exclude_urls_with_extant_model( + statement=statement, + model=model + ) + statement = statement.limit(1) + scalar_result = await session.scalars(statement) + return bool(scalar_result.first()) + + + @session_manager + async def has_urls_with_html_data_and_without_auto_record_type_suggestion(self, session: AsyncSession) -> bool: + return await self.has_urls_with_html_data_and_without_models( + session=session, + model=AutoRecordTypeSuggestion + ) + + @session_manager + async def has_urls_with_html_data_and_without_auto_relevant_suggestion(self, session: AsyncSession) -> bool: + return await self.has_urls_with_html_data_and_without_models( + session=session, + model=AutoRelevantSuggestion + ) + + + #TODO: Slated for deletion @session_manager async def 
get_urls_with_html_data_and_without_metadata_type( self, @@ -339,13 +441,25 @@ async def get_urls_with_html_data_and_without_metadata_type( ) -> list[URLWithHTML]: # Get URLs with no relevancy metadata - statement = (select(URL.id, URL.url, URLHTMLContent). - join(URLHTMLContent). - where(URL.outcome == URLStatus.PENDING.value)) + statement = (select(URL) + .options(selectinload(URL.html_content)) + .where(URL.outcome == URLStatus.PENDING.value)) + # Exclude URLs with auto suggested record types + statement = self.statement_composer.exclude_urls_with_extant_model( + statement=statement, + model=AutoRecordTypeSuggestion + ) + statement = statement.limit(100).order_by(URL.id) + + + # TODO: The below can probably be generalized + + statement = self.statement_composer.exclude_urls_with_select_metadata( statement=statement, attribute=without_metadata_type ) + # TODO: Generalize statement = statement.limit(100).order_by(URL.id) raw_result = await session.execute(statement) result = raw_result.all() @@ -388,129 +502,7 @@ async def has_pending_urls_with_html_data_and_without_metadata_type( result = raw_result.all() return len(result) > 0 - @session_manager - async def get_urls_with_metadata( - self, - session: AsyncSession, - attribute: URLMetadataAttributeType, - validation_status: ValidationStatus, - offset: int = 0 - ) -> list[URLMetadataInfo]: - statement = (select(URL.id, URLMetadata.id). - join(URLMetadata). - where(URLMetadata.attribute == attribute.value). - where(URLMetadata.validation_status == validation_status.value). - limit(100). - offset(offset). - order_by(URL.id) - ) - - raw_result = await session.execute(statement) - result = raw_result.all() - final_results = [] - for url_id, url_metadata_id in result: - info = URLMetadataInfo( - url_id=url_id, - id=url_metadata_id, - ) - final_results.append(info) - - return final_results - - @session_manager - async def update_url_metadata_status(self, session: AsyncSession, metadata_ids: list[int], - validation_status: ValidationStatus): - for metadata_id in metadata_ids: - statement = select(URLMetadata).where(URLMetadata.id == metadata_id) - scalar_result = await session.scalars(statement) - url_metadata = scalar_result.first() - url_metadata.validation_status = validation_status - - @session_manager - async def get_next_url_for_annotation( - self, - session: AsyncSession, - user_id: int, - metadata_type: URLMetadataAttributeType - ) -> URLAnnotationInfo: - # Get a URL, its relevancy metadata ID, and HTML data - # For a URL which has not yet been annotated by this user id - # First, subquery retrieving URL and its metadata ID where its relevant metadata - # does not have an annotation for that user - subquery = ( - select( - URL.id.label("url_id"), - URL.url, - URLMetadata.id.label("metadata_id"), - URLMetadata.value, - ) - .join(URLMetadata) - # Metadata must be relevant - .where(URLMetadata.attribute == metadata_type.value) - # Metadata must not be validated - .where(URLMetadata.validation_status == ValidationStatus.PENDING_VALIDATION.value) - # URL must have HTML content entries - .where(exists(select(URLHTMLContent).where(URLHTMLContent.url_id == URL.id))) - # URL must not have been annotated by the user - .where(~exists( - select(MetadataAnnotation). 
- where( - MetadataAnnotation.metadata_id == URLMetadata.id, - MetadataAnnotation.user_id == user_id - ) - )) - .limit(1) - ) - - # Next, get all HTML content for the URL - - statement = ( - select( - subquery.c.url, - subquery.c.metadata_id, - subquery.c.value, - URLHTMLContent.content_type, - URLHTMLContent.content, - ) - .join(URLHTMLContent) - .where(subquery.c.url_id == URLHTMLContent.url_id) - ) - - raw_result = await session.execute(statement) - result = raw_result.all() - - if len(result) == 0: - # No available URLs to annotate - return None - - annotation_info = URLAnnotationInfo( - url=result[0][0], - metadata_id=result[0][1], - suggested_value=result[0][2], - html_infos=[] - ) - for _, _, _, content_type, content in result: - html_info = URLHTMLContentInfo( - content_type=content_type, - content=content - ) - annotation_info.html_infos.append(html_info) - return annotation_info - @session_manager - async def add_metadata_annotation( - self, - session: AsyncSession, - user_id: int, - metadata_id: int, - annotation: str - ): - annotation = MetadataAnnotation( - metadata_id=metadata_id, - user_id=user_id, - value=annotation - ) - session.add(annotation) # @session_manager # async def get_annotations_for_metadata_id( @@ -552,7 +544,7 @@ async def add_to_root_url_cache(self, session: AsyncSession, url: str, page_titl @session_manager async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetURLsResponseInfo: statement = select(URL).options( - selectinload(URL.url_metadata), selectinload(URL.error_info) + selectinload(URL.error_info) ) if errors: # Only return URLs with errors @@ -566,18 +558,6 @@ async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetU all_results = execute_result.scalars().all() final_results = [] for result in all_results: - metadata_results = [] - for metadata in result.url_metadata: - metadata_result = GetURLsResponseMetadataInfo( - id=metadata.id, - attribute=URLMetadataAttributeType(metadata.attribute), - value=metadata.value, - validation_status=ValidationStatus(metadata.validation_status), - validation_source=ValidationSource(metadata.validation_source), - created_at=metadata.created_at, - updated_at=metadata.updated_at - ) - metadata_results.append(metadata_result) error_results = [] for error in result.error_info: error_result = GetURLsResponseErrorInfo( @@ -596,7 +576,6 @@ async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetU updated_at=result.updated_at, created_at=result.created_at, errors=error_results, - metadata=metadata_results ) ) @@ -765,7 +744,7 @@ async def has_urls_without_agency_suggestions( statement = ( select( URL.id - )) + ).where(URL.agency_id == None)) statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement) raw_result = await session.execute(statement) result = raw_result.all() @@ -812,13 +791,11 @@ async def get_next_url_agency_for_annotation( # Select statement statement = ( select(URL.id, URL.url) - # Must not be a confirmed URL - .join(ConfirmedUrlAgency, isouter=True) + # Must not have a confirmed agency identifier. .where( - ~exists( - select(ConfirmedUrlAgency). - where(ConfirmedUrlAgency.url_id == URL.id). 
- correlate(URL) + and_( + URL.agency_id.is_(None), + URL.outcome == URLStatus.PENDING.value ) ) # Must not have been annotated by this user @@ -924,11 +901,9 @@ async def add_confirmed_agency_url_links( suggestions: list[URLAgencySuggestionInfo] ): for suggestion in suggestions: - confirmed_agency_url_link = ConfirmedUrlAgency( - agency_id=suggestion.pdap_agency_id, - url_id=suggestion.url_id - ) - session.add(confirmed_agency_url_link) + url = await session.execute(select(URL).where(URL.id == suggestion.url_id)) + url = url.scalar_one() + url.agency_id = suggestion.pdap_agency_id @session_manager async def add_agency_auto_suggestions( @@ -965,88 +940,74 @@ async def add_agency_manual_suggestion( ) session.add(url_agency_suggestion) + @session_manager + async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[URL]: + statement = select(URL).where(URL.agency_id != None) + results = await session.execute(statement) + return list(results.scalars().all()) + @session_manager async def get_next_url_for_final_review( self, session: AsyncSession ) -> Optional[GetNextURLForFinalReviewResponse]: - # Subqueries for ORDER clause - # Subqueries for Counting distinct annotations - # Count distinct auto annotations for metadata - distinct_auto_metadata_subquery = ( - select( - URLMetadata.url_id, - func.count(distinct(URLMetadata.attribute)).label("auto_count") - ). - group_by(URLMetadata.url_id).subquery() - ) - # Count distinct user annotations for metadata - distinct_user_metadata_subquery = ( - select( - URLMetadata.url_id, - func.count(distinct(URLMetadata.attribute)).label("user_count") - ).join(MetadataAnnotation). - where(MetadataAnnotation.user_id != None). - group_by(URLMetadata.url_id).subquery() - ) + def annotations_exist_subquery(model: Type[Base]): + return ( + select( + URL.id.label("url_id"), + case( + ( + exists().where(URL.id == model.url_id), 1 + ), + else_=0 + ).label("exists") + ).subquery() + ) - # Count whether agency auto annotations exist - # (Note: Can be either confirmed or auto suggestion) - agency_annotations_exist_subquery = ( - select( - URL.id, - case( - ( - exists().where(URL.id == ConfirmedUrlAgency.url_id), 1 - ), - ( - exists().where(URL.id == AutomatedUrlAgencySuggestion.url_id), 1 - ), - else_=0 - ).label("agency_annotations_exist") - ).subquery() - ) + def count_subquery(model: Type[Base]): + return ( + select( + model.url_id, + func.count(model.url_id).label("count") + ).group_by(model.url_id).subquery() + ) - # Count whether agency user annotations exist - agency_user_annotations_exist_subquery = ( - select( - URL.id, - case( - ( - exists().where(URL.id == UserUrlAgencySuggestion.url_id), 1 - ), - else_=0 - ).label("agency_user_annotations_exist") - ).subquery() - ) + models = [ + AutoRelevantSuggestion, + UserRelevantSuggestion, + AutoRecordTypeSuggestion, + UserRecordTypeSuggestion, + AutomatedUrlAgencySuggestion, + UserUrlAgencySuggestion + ] - # Subqueries for counting *all* annotations + exist_subqueries = [ + annotations_exist_subquery(model=model) + for model in models + ] - # Count all auto annotations for metadata - all_auto_metadata_subquery = ( - select( - URLMetadata.url_id, - func.count(URLMetadata.attribute).label("auto_count") - ).group_by(URLMetadata.url_id).subquery() - ) - # Count all user annotations for metadata - all_user_metadata_subquery = ( - select( - URLMetadata.url_id, - func.count(URLMetadata.attribute).label("user_count") - ).join(MetadataAnnotation). - where(MetadataAnnotation.user_id != None). 
- group_by(URLMetadata.url_id).subquery() + sum_of_exist_subqueries = ( + sum( + [ + subquery.c.exists + for subquery in exist_subqueries] + ) ) - # Count all user agency annotations - all_user_agency_annotations_subquery = ( - select( - UserUrlAgencySuggestion.url_id, - func.count(UserUrlAgencySuggestion.agency_id).label("user_count") - ).group_by(UserUrlAgencySuggestion.url_id).subquery() + count_subqueries = [ + count_subquery(model=model) + for model in models + ] + + sum_of_count_subqueries = ( + sum( + [ + subquery.c.count + for subquery in count_subqueries + ] + ) ) # Basic URL query @@ -1054,43 +1015,43 @@ async def get_next_url_for_final_review( select( URL, ( - func.coalesce(distinct_auto_metadata_subquery.c.auto_count, 0) + - func.coalesce(distinct_user_metadata_subquery.c.user_count, 0) + - func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + - func.coalesce(agency_user_annotations_exist_subquery.c.agency_user_annotations_exist, 0) + sum_of_exist_subqueries ).label("total_distinct_annotation_count"), ( - func.coalesce(all_auto_metadata_subquery.c.auto_count, 0) + - func.coalesce(all_user_metadata_subquery.c.user_count, 0) + - func.coalesce(all_user_agency_annotations_subquery.c.user_count, 0) + - func.coalesce(agency_annotations_exist_subquery.c.agency_annotations_exist, 0) + sum_of_count_subqueries ).label("total_overall_annotation_count") - ).outerjoin( - distinct_auto_metadata_subquery, URL.id == distinct_auto_metadata_subquery.c.url_id - ).outerjoin( - distinct_user_metadata_subquery, URL.id == distinct_user_metadata_subquery.c.url_id - ).outerjoin( - agency_annotations_exist_subquery, URL.id == agency_annotations_exist_subquery.c.id - ).outerjoin( - agency_user_annotations_exist_subquery, URL.id == agency_user_annotations_exist_subquery.c.id - ).outerjoin( - all_auto_metadata_subquery, URL.id == all_auto_metadata_subquery.c.url_id - ).outerjoin( - all_user_metadata_subquery, URL.id == all_user_metadata_subquery.c.url_id - ).outerjoin( - all_user_agency_annotations_subquery, URL.id == all_user_agency_annotations_subquery.c.url_id - ).where( - URL.outcome == URLStatus.PENDING.value ) ) + + for subquery in (exist_subqueries + count_subqueries): + url_query = url_query.outerjoin( + subquery, URL.id == subquery.c.url_id + ) + + url_query = url_query.where( + URL.outcome == URLStatus.PENDING.value + ) + + single_join_relationships = [ + URL.agency, + URL.html_content, + URL.auto_record_type_suggestion, + URL.auto_relevant_suggestion, + URL.user_relevant_suggestions, + URL.user_record_type_suggestions, + ] + options = [ - joinedload(URL.html_content), - joinedload(URL.url_metadata).joinedload(URLMetadata.annotations), - joinedload(URL.automated_agency_suggestions).joinedload(AutomatedUrlAgencySuggestion.agency), - joinedload(URL.user_agency_suggestions).joinedload(UserUrlAgencySuggestion.agency), - joinedload(URL.confirmed_agencies).joinedload(ConfirmedUrlAgency.agency), + joinedload(relationship) for relationship in single_join_relationships ] + double_join_relationships = [ + (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), + (URL.user_agency_suggestions, UserUrlAgencySuggestion.agency) + ] + for primary, secondary in double_join_relationships: + options.append(joinedload(primary).joinedload(secondary)) + # Apply options url_query = url_query.options(*options) @@ -1116,23 +1077,24 @@ async def get_next_url_for_final_review( html_content = result.html_content html_content_infos = [URLHTMLContentInfo(**html_info.__dict__) for 
html_info in html_content] - automated_agency_suggestions = result.automated_agency_suggestions - user_agency_suggestions = result.user_agency_suggestions - confirmed_agencies = result.confirmed_agencies - url_metadatas = result.url_metadata - # Return return GetNextURLForFinalReviewResponse( id=result.id, url=result.url, html_info=convert_to_response_html_info(html_content_infos), annotations=FinalReviewAnnotationInfo( - relevant=DTOConverter.final_review_annotation_relevant_info(url_metadatas), - record_type=DTOConverter.final_review_annotation_record_type_info(url_metadatas), + relevant=DTOConverter.final_review_annotation_relevant_info( + user_suggestions=result.user_relevant_suggestions, + auto_suggestion=result.auto_relevant_suggestion + ), + record_type=DTOConverter.final_review_annotation_record_type_info( + user_suggestions=result.user_record_type_suggestions, + auto_suggestion=result.auto_record_type_suggestion + ), agency=DTOConverter.final_review_annotation_agency_info( - automated_agency_suggestions=automated_agency_suggestions, - confirmed_agencies=confirmed_agencies, - user_agency_suggestions=user_agency_suggestions + automated_agency_suggestions=result.automated_agency_suggestions, + user_agency_suggestions=result.user_agency_suggestions, + confirmed_agency=result.agency ) ) ) @@ -1142,96 +1104,33 @@ async def approve_url( self, session: AsyncSession, url_id: int, - record_type: Optional[RecordType] = None, - relevant: Optional[bool] = None, + record_type: RecordType, + relevant: bool, agency_id: Optional[int] = None ) -> None: - async def set_approved_metadata( - attribute: URLMetadataAttributeType, - value: Optional[str] - ): - selected_metadata = None - for metadata in metadatas: - if metadata.attribute == attribute.value: - selected_metadata = metadata - break - - # If metadata doesn't exist, create it - if selected_metadata is None: - # If a value was not provided for this metadata, raise an error. - if value is None: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Must specify {attribute.value} value if URL does not already have a {attribute.value} metadata entry" - ) - - metadata_obj = URLMetadata( - attribute=attribute.value, - value=value, - validation_status=ValidationStatus.VALIDATED.value, - validation_source=ValidationSource.MANUAL.value, - url_id=url_id - ) - url.url_metadata.append(metadata_obj) - - else: - - # If value was provided, overwrite existing value. 
Otherwise, ignore - if value is not None: - selected_metadata.value = value - - # Mark metadata as validated - selected_metadata.validation_status = ValidationStatus.VALIDATED.value - selected_metadata.validation_source = ValidationSource.MANUAL.value - # Get URL query = ( Select(URL) .where(URL.id == url_id) - .options( - selectinload(URL.url_metadata), - selectinload(URL.confirmed_agencies) - ) ) url = await session.execute(query) url = url.scalars().first() - metadatas = url.url_metadata - - await set_approved_metadata( - attribute=URLMetadataAttributeType.RECORD_TYPE, - value=record_type - ) + url.record_type = record_type.value + url.relevant = relevant - await set_approved_metadata( - attribute=URLMetadataAttributeType.RELEVANT, - value=relevant - ) - - # Check if agency_id exists as confirmed agency - confirmed_agency = url.confirmed_agencies[0] if len(url.confirmed_agencies) > 0 else None - - # If it doesn't, create it - if confirmed_agency is None: - if agency_id is None: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail="Must specify agency_id if URL does not already have a confirmed agency" - ) - - confirmed_agency = ConfirmedUrlAgency( - agency_id=agency_id, - url_id=url_id + if url.agency_id is None and agency_id is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Must specify agency_id if URL does not already have a confirmed agency" ) - url.confirmed_agencies.append(confirmed_agency) - # If a different agency exists as confirmed, overwrite it - elif confirmed_agency.agency_id != agency_id and agency_id is not None: - confirmed_agency.agency_id = agency_id + if url.agency_id != agency_id and agency_id is not None: + url.agency_id = agency_id # If it does, do nothing - url.outcome = URLStatus.APPROVED.value + url.outcome = URLStatus.VALIDATED.value diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py index 9b2b0d24..6bf9a967 100644 --- a/collector_db/DTOConverter.py +++ b/collector_db/DTOConverter.py @@ -1,6 +1,10 @@ -from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType +from typing import Optional + +from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType, URLHTMLContentInfo +from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.enums import ValidationStatus, ValidationSource, URLMetadataAttributeType -from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent +from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent, URL, Agency, \ + AutoRecordTypeSuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, AutoRelevantSuggestion from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo from core.DTOs.GetNextURLForFinalReviewResponse import FinalReviewAnnotationRelevantInfo, \ FinalReviewAnnotationRelevantUsersInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ @@ -8,91 +12,58 @@ from core.enums import RecordType, SuggestionType from html_tag_collector.DataClassTags import convert_to_response_html_info, ResponseHTMLInfo, ENUM_TO_ATTRIBUTE_MAPPING - -# -# def get_url_metadata( -# url_metadatas: list[URLMetadata], -# validation_status: ValidationStatus, -# validation_source: ValidationSource, -# attribute: URLMetadataAttributeType -# ): -# for url_metadata in url_metadatas: -# if url_metadata.validation_status != validation_status.value: -# continue -# if url_metadata.validation_source != validation_source.value: 
-# continue -# if url_metadata.attribute != attribute.value: -# continue -# return url_metadata -# - - class DTOConverter: """ Converts SQLAlchemy objects to DTOs """ - # - # @staticmethod - # def final_review_annotation_relevant_info( - # url_metadatas: list[URLMetadata] - # ) -> FinalReviewAnnotationRelevantInfo: - # relevant_metadata = get_url_metadata( - # url_metadatas=url_metadatas, - # validation_status=ValidationStatus.PENDING_VALIDATION, - # validation_source=ValidationSource.MACHINE_LEARNING, - # attribute=URLMetadataAttributeType.RELEVANT - # ) - # auto_value = relevant_metadata.value if relevant_metadata else None - # if auto_value is not None: - # auto_value = (auto_value == "True") - # - # - # annotations: list[MetadataAnnotation] = relevant_metadata.annotations if relevant_metadata else [] - # relevant_count = 0 - # not_relevant_count = 0 - # for annotation in annotations: - # if annotation.value == "True": - # relevant_count += 1 - # else: - # not_relevant_count += 1 - # return FinalReviewAnnotationRelevantInfo( - # auto=auto_value, - # users=FinalReviewAnnotationRelevantUsersInfo( - # relevant=relevant_count, - # not_relevant=not_relevant_count - # ) - # ) - # - # @staticmethod - # def final_review_annotation_record_type_info( - # url_metadata: list[URLMetadata] - # ): - # record_type_metadata = get_url_metadata( - # url_metadatas=url_metadata, - # validation_status=ValidationStatus.PENDING_VALIDATION, - # validation_source=ValidationSource.MACHINE_LEARNING, - # attribute=URLMetadataAttributeType.RECORD_TYPE - # ) - # user_count = {} - # if record_type_metadata is None: - # auto_value = None - # annotations = [] - # else: - # auto_value = RecordType(record_type_metadata.value) - # annotations = record_type_metadata.annotations - # for annotation in annotations: - # value = RecordType(annotation.value) - # if value not in user_count: - # user_count[value] = 0 - # user_count[value] += 1 - # # Sort users by count, descending - # user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) - # - # return FinalReviewAnnotationRecordTypeInfo( - # auto=auto_value, - # users=user_count - # ) + + @staticmethod + def final_review_annotation_relevant_info( + user_suggestions: list[UserRelevantSuggestion], + auto_suggestion: AutoRelevantSuggestion + ) -> FinalReviewAnnotationRelevantInfo: + + auto_value = auto_suggestion.relevant if auto_suggestion else None + + relevant_count = 0 + not_relevant_count = 0 + for suggestion in user_suggestions: + if suggestion.relevant: + relevant_count += 1 + else: + not_relevant_count += 1 + return FinalReviewAnnotationRelevantInfo( + auto=auto_value, + users=FinalReviewAnnotationRelevantUsersInfo( + relevant=relevant_count, + not_relevant=not_relevant_count + ) + ) + + @staticmethod + def final_review_annotation_record_type_info( + user_suggestions: list[UserRecordTypeSuggestion], + auto_suggestion: AutoRecordTypeSuggestion + ): + + user_count = {} + if auto_suggestion is None: + auto_value = None + else: + auto_value = RecordType(auto_suggestion.record_type) + for suggestion in user_suggestions: + value = RecordType(suggestion.record_type) + if value not in user_count: + user_count[value] = 0 + user_count[value] += 1 + # Sort users by count, descending + user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) + + return FinalReviewAnnotationRecordTypeInfo( + auto=auto_value, + users=user_count + ) @staticmethod def final_review_annotation_agency_auto_info( @@ -105,7 +76,6 @@ def 
final_review_annotation_agency_auto_info( suggestions=[] ) - if len(automated_agency_suggestions) == 1: suggestion = automated_agency_suggestions[0] unknown = suggestion.is_unknown @@ -160,44 +130,64 @@ def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( # Return sorted return dict(sorted(d.items(), key=lambda x: x[1].count, reverse=True)) - # - # @staticmethod - # def final_review_annotation_agency_info( - # automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], - # confirmed_agencies: list[ConfirmedUrlAgency], - # user_agency_suggestions: list[UserUrlAgencySuggestion] - # ): - # if len(confirmed_agencies) == 1: - # confirmed_agency = confirmed_agencies[0] - # confirmed_agency_info = GetNextURLForAgencyAgencyInfo( - # suggestion_type=SuggestionType.CONFIRMED, - # pdap_agency_id=confirmed_agency.agency_id, - # agency_name=confirmed_agency.agency.name, - # state=confirmed_agency.agency.state, - # county=confirmed_agency.agency.county, - # locality=confirmed_agency.agency.locality - # ) - # return FinalReviewAnnotationAgencyInfo( - # confirmed=confirmed_agency_info, - # users=None, - # auto=None - # ) - # - # - # agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( - # automated_agency_suggestions - # ) - # - # agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - # user_agency_suggestions - # ) - # - # return FinalReviewAnnotationAgencyInfo( - # confirmed=None, - # users=agency_user_info, - # auto=agency_auto_info - # ) - # + + @staticmethod + def final_review_annotation_agency_info( + automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], + confirmed_agency: Optional[Agency], + user_agency_suggestions: list[UserUrlAgencySuggestion] + ): + if confirmed_agency is not None: + confirmed_agency_info = GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=confirmed_agency.agency_id, + agency_name=confirmed_agency.name, + state=confirmed_agency.state, + county=confirmed_agency.county, + locality=confirmed_agency.locality + ) + return FinalReviewAnnotationAgencyInfo( + confirmed=confirmed_agency_info, + users=None, + auto=None + ) + + agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( + automated_agency_suggestions + ) + + agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestions + ) + + return FinalReviewAnnotationAgencyInfo( + confirmed=None, + users=agency_user_info, + auto=agency_auto_info + ) + + + @staticmethod + def url_list_to_url_with_html_list(url_list: list[URL]) -> list[URLWithHTML]: + return [DTOConverter.url_to_url_with_html(url) for url in url_list] + + @staticmethod + def url_to_url_with_html(url: URL) -> URLWithHTML: + url_val = url.url + url_id = url.id + html_infos = [] + for html_info in url.html_content: + html_infos.append( + URLHTMLContentInfo( + **html_info.__dict__ + ) + ) + + return URLWithHTML( + url=url_val, + url_id=url_id, + html_infos=html_infos + ) @staticmethod def html_content_list_to_html_response_info(html_content_list: list[URLHTMLContent]): diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index b3ab5312..d69dd078 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,3 +1,4 @@ +from typing import Any from sqlalchemy import Select, select, exists, Table, func, Subquery from sqlalchemy.orm import aliased @@ -19,6 +20,22 @@ def 
pending_urls_without_html_data() -> Select: where(URLHTMLContent.id == None). where(URL.outcome == URLStatus.PENDING.value)) + + + @staticmethod + def exclude_urls_with_extant_model( + statement: Select, + model: Any + ): + return (statement.where( + ~exists( + select(model.id). + where( + model.url_id == URL.id + ) + ) + )) + @staticmethod def exclude_urls_with_select_metadata( statement: Select, @@ -64,11 +81,9 @@ def exclude_urls_with_agency_suggestions( ): # Aliases for clarity AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) - ConfirmedAgency = aliased(ConfirmedUrlAgency) statement = (statement .where(~exists().where(AutomatedSuggestion.url_id == URL.id)) # Exclude if automated suggestions exist - .where(~exists().where(ConfirmedAgency.url_id == URL.id)) ) # Exclude if confirmed agencies exist return statement diff --git a/collector_db/models.py b/collector_db/models.py index 8e3d2a7d..51fc4a2a 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -92,8 +92,7 @@ class URL(Base): postgresql.ENUM( 'pending', 'submitted', - 'human_labeling', - 'rejected', + 'validated', 'duplicate', 'error', name='url_status' @@ -116,63 +115,19 @@ class URL(Base): secondary="link_task_urls", back_populates="urls", ) - agency = relationship("Agency", back_populates="urls") - automated_agency_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="url") - user_agency_suggestions = relationship("UserUrlAgencySuggestion", back_populates="url") - auto_record_type_suggestions = relationship("AutoRecordTypeSuggestion", back_populates="url") - user_record_type_suggestions = relationship("UserRecordTypeSuggestion", back_populates="url") - auto_relevant_suggestions = relationship("AutoRelevantSuggestion", back_populates="url") - user_relevant_suggestions = relationship("UserRelevantSuggestion", back_populates="url") - -# # URL Metadata table definition -# class URLMetadata(Base): -# __tablename__ = 'url_metadata' -# __table_args__ = (UniqueConstraint( -# "url_id", -# "attribute", -# name="uq_url_id_attribute"), -# ) -# -# id = Column(Integer, primary_key=True, autoincrement=True) -# url_id = Column(Integer, ForeignKey('urls.id', name='url_metadata_url_id_fkey'), nullable=False) -# attribute = Column( -# PGEnum('Record Type', 'Agency', 'Relevant', name='url_attribute'), -# nullable=False) -# value = Column(Text, nullable=False) -# validation_status = Column( -# PGEnum('Pending Validation', 'Validated', name='metadata_validation_status'), -# nullable=False) -# validation_source = Column( -# PGEnum('Machine Learning', 'Label Studio', 'Manual', name='validation_source'), -# nullable=False -# ) -# notes = Column(Text, nullable=True) -# -# -# # Timestamps -# created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) -# updated_at = Column(TIMESTAMP, nullable=False, server_default=func.now(), onupdate=func.now()) -# -# # Relationships -# url = relationship("URL", back_populates="url_metadata") -# annotations = relationship("MetadataAnnotation", back_populates="url_metadata") - -# class MetadataAnnotation(Base): -# __tablename__ = 'metadata_annotations' -# __table_args__ = (UniqueConstraint( -# "user_id", -# "metadata_id", -# name="metadata_annotations_uq_user_id_metadata_id"), -# ) -# -# id = Column(Integer, primary_key=True, autoincrement=True) -# user_id = Column(Integer, nullable=False) -# metadata_id = Column(Integer, ForeignKey('url_metadata.id'), nullable=False) -# value = Column(Text, nullable=False) -# created_at = Column(TIMESTAMP, nullable=False, 
server_default=func.now()) -# -# # Relationships -# url_metadata = relationship("URLMetadata", back_populates="annotations") + agency = relationship("Agency", uselist=False, back_populates="urls") + automated_agency_suggestions = relationship( + "AutomatedUrlAgencySuggestion", back_populates="url") + user_agency_suggestions = relationship( + "UserUrlAgencySuggestion", back_populates="url") + auto_record_type_suggestion = relationship( + "AutoRecordTypeSuggestion", uselist=False, back_populates="url") + user_record_type_suggestions = relationship( + "UserRecordTypeSuggestion", back_populates="url") + auto_relevant_suggestion = relationship( + "AutoRelevantSuggestion", uselist=False, back_populates="url") + user_relevant_suggestions = relationship( + "UserRelevantSuggestion", back_populates="url") class RootURL(Base): __tablename__ = 'root_url_cache' @@ -409,7 +364,7 @@ class AutoRelevantSuggestion(Base): # Relationships - url = relationship("URL", back_populates="auto_relevant_suggestions") + url = relationship("URL", back_populates="auto_relevant_suggestion") class AutoRecordTypeSuggestion(Base): @@ -427,7 +382,7 @@ class AutoRecordTypeSuggestion(Base): # Relationships - url = relationship("URL", back_populates="auto_record_type_suggestions") + url = relationship("URL", back_populates="auto_record_type_suggestion") class UserRelevantSuggestion(Base): __tablename__ = "user_relevant_suggestions" diff --git a/collector_manager/enums.py b/collector_manager/enums.py index b4289488..e90ee7db 100644 --- a/collector_manager/enums.py +++ b/collector_manager/enums.py @@ -12,8 +12,6 @@ class CollectorType(Enum): class URLStatus(Enum): PENDING = "pending" SUBMITTED = "submitted" - HUMAN_LABELING = "human_labeling" - REJECTED = "rejected" - DUPLICATE = "duplicate" + VALIDATED = "validated" ERROR = "error" - APPROVED = "approved" + DUPLICATE = "duplicate" diff --git a/core/AsyncCore.py b/core/AsyncCore.py index a4cea685..a576082f 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -8,6 +8,7 @@ from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo from collector_db.enums import TaskType, URLMetadataAttributeType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo @@ -131,50 +132,6 @@ async def handle_task_error(self, run_info: TaskOperatorRunInfo): await self.adb_client.update_task_status(task_id=run_info.task_id, status=BatchStatus.ERROR) await self.adb_client.add_task_error(task_id=run_info.task_id, error=run_info.message) - async def convert_to_annotation_request_info(self, url_info: URLAnnotationInfo) -> AnnotationRequestInfo: - response_html_info = convert_to_response_html_info( - html_content_infos=url_info.html_infos - ) - - return AnnotationRequestInfo( - url=url_info.url, - metadata_id=url_info.metadata_id, - html_info=response_html_info, - suggested_value=url_info.suggested_value - ) - - async def get_next_url_for_annotation(self, user_id: int, metadata_type: URLMetadataAttributeType) -> GetNextURLForAnnotationResponse: - response = GetNextURLForAnnotationResponse() - ua_info: URLAnnotationInfo = await self.adb_client.get_next_url_for_annotation( - user_id=user_id, - metadata_type=metadata_type - ) - if ua_info is None: - 
return response - # Format result - result = await self.convert_to_annotation_request_info(url_info=ua_info) - response.next_annotation = result - return response - - async def submit_and_get_next_url_for_annotation( - self, - user_id: int, - metadata_id: int, - annotation: str, - metadata_type: URLMetadataAttributeType - ) -> GetNextURLForAnnotationResponse: - await self.submit_url_annotation( - user_id=user_id, - metadata_id=metadata_id, - annotation=annotation, - metadata_type=metadata_type - ) - result = await self.get_next_url_for_annotation( - user_id=user_id, - metadata_type=metadata_type - ) - return result - async def submit_url_relevance_annotation( self, user_id: int, @@ -205,24 +162,16 @@ async def submit_url_record_type_annotation( url_id: int, record_type: RecordType ): - return await self.adb_client.add_user_record_type_suggestion( + await self.adb_client.add_user_record_type_suggestion( user_id=user_id, url_id=url_id, record_type=record_type ) + next_annotation = await self.adb_client.get_next_url_for_record_type_annotation(user_id=user_id) + return GetNextRecordTypeAnnotationResponseOuterInfo( + next_annotation=next_annotation + ) - async def submit_url_annotation( - self, - user_id: int, - metadata_id: int, - annotation: str, - metadata_type: URLMetadataAttributeType - ) -> GetNextURLForAnnotationResponse: - await self.adb_client.add_metadata_annotation( - user_id=user_id, - metadata_id=metadata_id, - annotation=annotation) - return await self.get_next_url_for_annotation(user_id=user_id, metadata_type=metadata_type) async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: return await self.adb_client.get_urls(page=page, errors=errors) diff --git a/core/DTOs/FinalReviewApprovalInfo.py b/core/DTOs/FinalReviewApprovalInfo.py index 96af2f87..210a07e3 100644 --- a/core/DTOs/FinalReviewApprovalInfo.py +++ b/core/DTOs/FinalReviewApprovalInfo.py @@ -9,12 +9,12 @@ class FinalReviewApprovalInfo(BaseModel): url_id: int = Field( title="The id of the URL." ) - record_type: Optional[RecordType] = Field( + record_type: RecordType = Field( title="The final record type of the URL." "If none, defers to the existing value from the auto-labeler only if it exists.", default=None ) - relevant: Optional[bool] = Field( + relevant: bool = Field( title="Final determination on whether the URL is relevant." 
"If none, defers to the existing value from the auto-labeler only if it exists.", default=None diff --git a/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py b/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py new file mode 100644 index 00000000..783b5516 --- /dev/null +++ b/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py @@ -0,0 +1,22 @@ +from typing import Optional + +from pydantic import Field, BaseModel + +from collector_db.DTOs.URLMapping import URLMapping +from core.enums import RecordType +from html_tag_collector.DataClassTags import ResponseHTMLInfo + + +class GetNextRecordTypeAnnotationResponseInfo(BaseModel): + url_info: URLMapping = Field( + title="Information about the URL" + ) + suggested_record_type: Optional[RecordType] = Field( + title="Whether the auto-labeler identified the URL as relevant or not" + ) + html_info: ResponseHTMLInfo = Field( + title="HTML information about the URL" + ) + +class GetNextRecordTypeAnnotationResponseOuterInfo(BaseModel): + next_annotation: Optional[GetNextRecordTypeAnnotationResponseInfo] diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py index 8a7077a1..fad414af 100644 --- a/core/DTOs/GetNextURLForFinalReviewResponse.py +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -2,7 +2,6 @@ from pydantic import BaseModel, Field -from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo from core.enums import RecordType from html_tag_collector.DataClassTags import ResponseHTMLInfo diff --git a/core/DTOs/GetURLsResponseInfo.py b/core/DTOs/GetURLsResponseInfo.py index 796b6494..162e92b5 100644 --- a/core/DTOs/GetURLsResponseInfo.py +++ b/core/DTOs/GetURLsResponseInfo.py @@ -29,7 +29,6 @@ class GetURLsResponseInnerInfo(BaseModel): updated_at: datetime.datetime created_at: datetime.datetime errors: list[GetURLsResponseErrorInfo] - metadata: list[GetURLsResponseMetadataInfo] class GetURLsResponseInfo(BaseModel): urls: list[GetURLsResponseInnerInfo] diff --git a/core/classes/URLRecordTypeTaskOperator.py b/core/classes/URLRecordTypeTaskOperator.py index 6287bcae..3f94811f 100644 --- a/core/classes/URLRecordTypeTaskOperator.py +++ b/core/classes/URLRecordTypeTaskOperator.py @@ -1,7 +1,6 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo -from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo -from collector_db.enums import URLMetadataAttributeType, TaskType, ValidationStatus, ValidationSource +from collector_db.enums import TaskType from core.DTOs.task_data_objects.URLRecordTypeTDO import URLRecordTypeTDO from core.classes.TaskOperatorBase import TaskOperatorBase from core.enums import RecordType @@ -23,14 +22,10 @@ def task_type(self): return TaskType.RECORD_TYPE async def meets_task_prerequisites(self): - return await self.adb_client.has_pending_urls_with_html_data_and_without_metadata_type( - without_metadata_type=URLMetadataAttributeType.RECORD_TYPE - ) + return await self.adb_client.has_urls_with_html_data_and_without_auto_record_type_suggestion() async def get_tdos(self) -> list[URLRecordTypeTDO]: - urls_with_html = await self.adb_client.get_urls_with_html_data_and_without_metadata_type( - without_metadata_type=URLMetadataAttributeType.RECORD_TYPE - ) + urls_with_html = await self.adb_client.get_urls_with_html_data_and_without_auto_record_type_suggestion() tdos = 
[URLRecordTypeTDO(url_with_html=url_with_html) for url_with_html in urls_with_html] return tdos @@ -58,18 +53,12 @@ async def update_errors_in_database(self, tdos: list[URLRecordTypeTDO]): await self.adb_client.add_url_error_infos(error_infos) async def put_results_into_database(self, tdos: list[URLRecordTypeTDO]): - url_metadatas = [] + suggestions = [] for tdo in tdos: - url_metadata = URLMetadataInfo( - url_id=tdo.url_with_html.url_id, - attribute=URLMetadataAttributeType.RECORD_TYPE, - value=str(tdo.record_type.value), - validation_status=ValidationStatus.PENDING_VALIDATION, - validation_source=ValidationSource.MACHINE_LEARNING, - notes=self.classifier.model_name - ) - url_metadatas.append(url_metadata) - await self.adb_client.add_url_metadatas(url_metadatas) + url_id = tdo.url_with_html.url_id + record_type = tdo.record_type + suggestions.append((url_id, record_type)) + await self.adb_client.add_auto_record_type_suggestions(suggestions) async def separate_success_and_error_subsets(self, tdos: list[URLRecordTypeTDO]): success_subset = [tdo for tdo in tdos if not tdo.is_errored()] diff --git a/core/classes/URLRelevanceHuggingfaceTaskOperator.py b/core/classes/URLRelevanceHuggingfaceTaskOperator.py index 2d54a856..e6ebdc3d 100644 --- a/core/classes/URLRelevanceHuggingfaceTaskOperator.py +++ b/core/classes/URLRelevanceHuggingfaceTaskOperator.py @@ -22,14 +22,12 @@ def task_type(self): return TaskType.RELEVANCY async def meets_task_prerequisites(self): - return await self.adb_client.has_pending_urls_with_html_data_and_without_metadata_type() + return await self.adb_client.has_urls_with_html_data_and_without_auto_relevant_suggestion() async def inner_task_logic(self): # Get pending urls from Source Collector # with HTML data and without Relevancy Metadata - tdos = await self.get_pending_url_info( - without_metadata_attribute=URLMetadataAttributeType.RELEVANT - ) + tdos = await self.get_pending_url_info() url_ids = [tdo.url_with_html.url_id for tdo in tdos] await self.link_urls_to_task(url_ids=url_ids) # Pipe into Huggingface @@ -39,17 +37,13 @@ async def inner_task_logic(self): await self.put_results_into_database(tdos) async def put_results_into_database(self, tdos): - url_metadatas = [] + suggestions: list[tuple[int, bool]] = [] for tdo in tdos: - url_metadata = URLMetadataInfo( - url_id=tdo.url_with_html.url_id, - attribute=URLMetadataAttributeType.RELEVANT, - value=str(tdo.relevant), - validation_status=ValidationStatus.PENDING_VALIDATION, - validation_source=ValidationSource.MACHINE_LEARNING - ) - url_metadatas.append(url_metadata) - await self.adb_client.add_url_metadatas(url_metadatas) + url_id = tdo.url_with_html.url_id + relevant = tdo.relevant + suggestions.append((url_id, relevant)) + + await self.adb_client.add_auto_relevance_suggestions(suggestions) async def add_huggingface_relevancy(self, tdos: list[URLRelevanceHuggingfaceTDO]): urls_with_html = [tdo.url_with_html for tdo in tdos] @@ -59,12 +53,9 @@ async def add_huggingface_relevancy(self, tdos: list[URLRelevanceHuggingfaceTDO] async def get_pending_url_info( self, - without_metadata_attribute: URLMetadataAttributeType ) -> list[URLRelevanceHuggingfaceTDO]: tdos = [] - pending_urls: list[URLWithHTML] = await self.adb_client.get_urls_with_html_data_and_without_metadata_type( - without_metadata_type=without_metadata_attribute - ) + pending_urls: list[URLWithHTML] = await self.adb_client.get_urls_with_html_data_and_without_auto_relevant_suggestion() for url_with_html in pending_urls: tdo = URLRelevanceHuggingfaceTDO( 
url_with_html=url_with_html diff --git a/local_database/DataDumper/dump.sh b/local_database/DataDumper/dump.sh index fd63c65f..fb514157 100644 --- a/local_database/DataDumper/dump.sh +++ b/local_database/DataDumper/dump.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -e +#set -e # Variables (customize these or pass them as environment variables) DB_HOST=${DUMP_HOST:-"postgres_container"} DB_USER=${DUMP_USER:-"your_user"} diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index b33051b5..5288496b 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -15,7 +15,7 @@ from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_manager.enums import CollectorType, URLStatus from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo -from core.enums import BatchStatus, SuggestionType +from core.enums import BatchStatus, SuggestionType, RecordType from tests.helpers.simple_test_data_functions import generate_test_urls @@ -85,6 +85,40 @@ async def auto_relevant_suggestions(self, url_id: int, relevant: bool = True): relevant=relevant ) + async def user_relevant_suggestion( + self, + url_id: int, + user_id: Optional[int] = None, + relevant: bool = True + ): + if user_id is None: + user_id = randint(1, 99999999) + await self.adb_client.add_user_relevant_suggestion( + url_id=url_id, + user_id=user_id, + relevant=relevant + ) + + async def user_record_type_suggestion( + self, + url_id: int, + record_type: RecordType, + user_id: Optional[int] = None, + ): + if user_id is None: + user_id = randint(1, 99999999) + await self.adb_client.add_user_record_type_suggestion( + url_id=url_id, + user_id=user_id, + record_type=record_type + ) + + async def auto_record_type_suggestions(self, url_id: int, record_type: RecordType): + await self.adb_client.add_auto_record_type_suggestion( + url_id=url_id, + record_type=record_type + ) + async def auto_suggestions( self, @@ -192,28 +226,6 @@ async def html_data(self, url_ids: list[int]): ) await self.adb_client.add_html_content_infos(html_content_infos) - async def metadata( - self, - url_ids: list[int], - attribute: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT, - value: str = "False", - validation_status: ValidationStatus = ValidationStatus.PENDING_VALIDATION, - validation_source: ValidationSource = ValidationSource.MACHINE_LEARNING - ) -> list[int]: - metadata_ids = [] - for url_id in url_ids: - metadata_id = await self.adb_client.add_url_metadata( - URLMetadataInfo( - url_id=url_id, - attribute=attribute, - value=value, - validation_status=validation_status, - validation_source=validation_source, - ) - ) - metadata_ids.append(metadata_id) - return metadata_ids - async def error_info( self, url_ids: list[int], @@ -231,19 +243,6 @@ async def error_info( error_infos.append(url_error_info) await self.adb_client.add_url_error_infos(error_infos) - async def user_annotation( - self, - metadata_id: int, - user_id: Optional[int] = None, - annotation: str = "test annotation" - ): - if user_id is None: - user_id = randint(1, 99999999) - await self.adb_client.add_metadata_annotation( - user_id=user_id, - metadata_id=metadata_id, - annotation=annotation - ) async def agency_auto_suggestions( self, diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index b45fb14a..44415090 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -18,14 +18,7 @@ async 
def setup_for_get_next_url_for_final_review( url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] await db_data_creator.html_data([url_mapping.url_id]) - async def add_metadata_annotation(count: int, value: str, metadata_id: int): - for i in range(count): - await db_data_creator.user_annotation( - metadata_id=metadata_id, - annotation=value - ) - - async def add_user_suggestion(count: int): + async def add_agency_suggestion(count: int): agency_id = await db_data_creator.agency() for i in range(count): await db_data_creator.agency_user_suggestions( @@ -33,34 +26,41 @@ async def add_user_suggestion(count: int): agency_id=agency_id ) - relevant_metadata_ids = await db_data_creator.metadata( - url_ids=[url_mapping.url_id], - attribute=URLMetadataAttributeType.RELEVANT, - value="True", - validation_source=ValidationSource.MACHINE_LEARNING, - validation_status=ValidationStatus.PENDING_VALIDATION + async def add_record_type_suggestion(count: int, record_type: RecordType): + for i in range(count): + await db_data_creator.user_record_type_suggestion( + url_id=url_mapping.url_id, + record_type=record_type + ) + + async def add_relevant_suggestion(count: int, relevant: bool): + for i in range(count): + await db_data_creator.user_relevant_suggestion( + url_id=url_mapping.url_id, + relevant=relevant + ) + + await db_data_creator.auto_relevant_suggestions( + url_id=url_mapping.url_id, + relevant=True ) - relevant_metadata_id = relevant_metadata_ids[0] - record_type_metadata_ids = await db_data_creator.metadata( - url_ids=[url_mapping.url_id], - attribute=URLMetadataAttributeType.RECORD_TYPE, - value=RecordType.ARREST_RECORDS.value, - validation_source=ValidationSource.MACHINE_LEARNING, - validation_status=ValidationStatus.PENDING_VALIDATION + + await db_data_creator.auto_record_type_suggestions( + url_id=url_mapping.url_id, + record_type=RecordType.ARREST_RECORDS ) - record_type_metadata_id = record_type_metadata_ids[0] if include_user_annotations: - await add_metadata_annotation(annotation_count, "True", relevant_metadata_id) - await add_metadata_annotation(1, "False", relevant_metadata_id) - await add_metadata_annotation(3, RecordType.ARREST_RECORDS.value, record_type_metadata_id) - await add_metadata_annotation(2, RecordType.DISPATCH_RECORDINGS.value, record_type_metadata_id) - await add_metadata_annotation(1, RecordType.ACCIDENT_REPORTS.value, record_type_metadata_id) + await add_relevant_suggestion(annotation_count, True) + await add_relevant_suggestion(1, False) + await add_record_type_suggestion(3, RecordType.ARREST_RECORDS) + await add_record_type_suggestion(2, RecordType.DISPATCH_RECORDINGS) + await add_record_type_suggestion(1, RecordType.ACCIDENT_REPORTS) if include_user_annotations: # Add user suggestions for agencies, one suggested by 3 users, another by 2, another by 1 for i in range(annotation_count): - await add_user_suggestion(i + 1) + await add_agency_suggestion(i + 1) return url_mapping diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 1673ca42..7018d5aa 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -1,4 +1,3 @@ -import polars as pl import pytest from collector_db.AsyncDatabaseClient import AsyncDatabaseClient diff --git a/tests/test_alembic/test_revisions.py b/tests/test_alembic/test_revisions.py index 343890df..9bc287d1 100644 --- 
a/tests/test_alembic/test_revisions.py +++ b/tests/test_alembic/test_revisions.py @@ -372,4 +372,9 @@ def test_revise_agency_suggestions(alembic_runner): assert not alembic_runner.tables_exist(tables_to_check) alembic_runner.upgrade("d7eb670edaf0") assert not alembic_runner.table_exists("url_agency_suggestions") - assert alembic_runner.tables_exist(tables_to_check) \ No newline at end of file + assert alembic_runner.tables_exist(tables_to_check) + +def test_full_upgrade_downgrade(alembic_runner): + # Both should run without error + alembic_runner.upgrade("head") + alembic_runner.downgrade("base") \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 8a7cd487..064e9912 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -12,6 +12,7 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse +from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo, \ GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -180,22 +181,22 @@ def get_next_relevance_annotation(self) -> GetNextRelevanceAnnotationResponseOut ) return GetNextRelevanceAnnotationResponseOuterInfo(**data) - def get_next_record_type_annotation(self) -> GetNextURLForAnnotationResponse: + def get_next_record_type_annotation(self) -> GetNextRecordTypeAnnotationResponseOuterInfo: data = self.get( url=f"/annotate/record-type" ) - return GetNextURLForAnnotationResponse(**data) + return GetNextRecordTypeAnnotationResponseOuterInfo(**data) def post_record_type_annotation_and_get_next( self, - metadata_id: int, + url_id: int, record_type_annotation_post_info: RecordTypeAnnotationPostInfo - ) -> GetNextURLForAnnotationResponse: + ) -> GetNextRecordTypeAnnotationResponseOuterInfo: data = self.post( - url=f"/annotate/record-type/{metadata_id}", + url=f"/annotate/record-type/{url_id}", json=record_type_annotation_post_info.model_dump(mode='json') ) - return GetNextURLForAnnotationResponse(**data) + return GetNextRecordTypeAnnotationResponseOuterInfo(**data) def post_relevance_annotation_and_get_next( self, diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index b90ad1cc..1530dcb1 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -5,7 +5,8 @@ from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.URLMapping import URLMapping from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import UserUrlAgencySuggestion, UserRelevantSuggestion +from collector_db.models import UserUrlAgencySuggestion, UserRelevantSuggestion, UserRecordTypeSuggestion +from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse 
import URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse @@ -29,83 +30,6 @@ def check_html_info_not_empty( assert html_info.description != "" assert html_info.title != "" -async def run_annotation_test( - api_test_helper, - submit_and_get_next_function: callable, - get_next_function: callable, - post_info: Any, - metadata_attribute: URLMetadataAttributeType, - expected_metadata_value: str -): - ath = api_test_helper - - # Create batch with status `in-process` and strategy `example` - batch_id = ath.db_data_creator.batch() - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - kwargs = { - "attribute": metadata_attribute, - "validation_status": ValidationStatus.PENDING_VALIDATION, - "validation_source": ValidationSource.MACHINE_LEARNING - } - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.metadata( - url_ids=[url_1.url_id], - **kwargs - ) - # and `Relevancy` attribute with value `False` to 2nd other URL - await ath.db_data_creator.metadata( - url_ids=[url_2.url_id], - **kwargs - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - # Call `GET` `/annotate/url` and receive next URL - request_info_1: GetNextURLForAnnotationResponse = get_next_function() - inner_info_1 = request_info_1.next_annotation - - # Validate presence of HTML data in `html` field - assert inner_info_1.html_info.description != "" - assert inner_info_1.html_info.title != "" - - # Validate that the correct metadata value is returned - assert inner_info_1.suggested_value == "False" - - # Call `POST` `/annotate/url` with finished annotation, and receive next URL - request_info_2 = submit_and_get_next_function( - inner_info_1.metadata_id, - post_info - ) - inner_info_2 = request_info_2.next_annotation - # Confirm 2nd URL is distinct from 1st - assert inner_info_1.url != inner_info_2.url - - # Validate presence of appropriate HTML data in `html` field - assert inner_info_2.html_info.description != "" - assert inner_info_2.html_info.title != "" - - # Validation annotation is present in database - results = await api_test_helper.db_data_creator.adb_client.get_annotations_for_metadata_id( - metadata_id=inner_info_1.metadata_id - ) - assert len(results) == 1 - assert results[0].user_id == MOCK_USER_ID - assert results[0].value == expected_metadata_value - - # Submit this one in turn, and no subsequent annotation info should be returned - request_info_3 = submit_and_get_next_function( - inner_info_2.metadata_id, - post_info - ) - - assert request_info_3.next_annotation is None - @pytest.mark.asyncio async def test_annotate_relevancy(api_test_helper): ath = api_test_helper @@ -132,7 +56,7 @@ async def test_annotate_relevancy(api_test_helper): # Add HTML data to both await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - # Call `GET` `/annotate/url` and receive next URL + # Call `GET` `/annotate/relevance` and receive next URL request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() inner_info_1 = request_info_1.next_annotation @@ -201,17 +125,94 @@ async def test_annotate_relevancy(api_test_helper): @pytest.mark.asyncio async def test_annotate_record_type(api_test_helper): - await run_annotation_test( - api_test_helper=api_test_helper, - 
submit_and_get_next_function=api_test_helper.request_validator.post_record_type_annotation_and_get_next, - get_next_function=api_test_helper.request_validator.get_next_record_type_annotation, - post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ACCIDENT_REPORTS - ), - metadata_attribute=URLMetadataAttributeType.RECORD_TYPE, - expected_metadata_value=RecordType.ACCIDENT_REPORTS.value + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct record type is returned + assert inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS + + # Annotate with value 'Personnel Records' and get next URL + request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.PERSONNEL_RECORDS + ) ) + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match(inner_info_2.url_info, url_2) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. 
Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value + + # If the user submits an annotation for the same URL, the existing annotation should be overwritten + + request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.BOOKING_REPORTS + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(results) == 2 + + for result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert result.record_type == RecordType.BOOKING_REPORTS.value + + @pytest.mark.asyncio async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): """ diff --git a/tests/test_automated/integration/api/test_url.py b/tests/test_automated/integration/api/test_url.py index 9ccc7e5f..fccd8e4e 100644 --- a/tests/test_automated/integration/api/test_url.py +++ b/tests/test_automated/integration/api/test_url.py @@ -21,9 +21,6 @@ async def test_get_urls(api_test_helper): url_id_1st = iui.url_mappings[0].url_id - # Add metadata - await db_data_creator.metadata(url_ids=[url_id_1st]) - # Get the latter 2 urls url_ids = [iui.url_mappings[1].url_id, iui.url_mappings[2].url_id] @@ -35,12 +32,10 @@ async def test_get_urls(api_test_helper): assert data.count == 3 assert len(data.urls) == 3 assert data.urls[0].url == iui.url_mappings[0].url - assert len(data.urls[0].metadata) == 1 for i in range(1, 3): assert data.urls[i].url == iui.url_mappings[i].url assert len(data.urls[i].errors) == 1 - assert len(data.urls[i].metadata) == 0 # Retrieve data again with errors only data: GetURLsResponseInfo = api_test_helper.request_validator.get_urls(errors=True) diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py index 9c31c9cf..2b2fcbca 100644 --- a/tests/test_automated/integration/collector_db/test_database_structure.py +++ b/tests/test_automated/integration/collector_db/test_database_structure.py @@ -235,45 +235,6 @@ def test_url(db_data_creator: DBDataCreator): table_tester.run_column_tests() -def test_url_metadata(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - iui: InsertURLsInfo = db_data_creator.urls(batch_id=batch_id, url_count=1) - - - table_tester = TableTester( - table_name="url_metadata", - columns=[ - ColumnTester( - column_name="url_id", - type_=sa.Integer, - allowed_values=[iui.url_mappings[0].url_id] - ), - ColumnTester( - column_name="attribute", - type_=postgresql.ENUM, - allowed_values=["Record Type", "Agency", "Relevant"] - ), - ColumnTester( - column_name="value", - type_=sa.Text, - allowed_values=["Text"] - ), - ColumnTester( - column_name="validation_status", - type_=postgresql.ENUM, - allowed_values=["Pending Validation", "Validated"] - ), - ColumnTester( - column_name="validation_source", - type_=postgresql.ENUM, - allowed_values=["Machine Learning", "Label Studio", "Manual"] -
) - ], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - def test_html_content(db_data_creator: DBDataCreator): batch_id = db_data_creator.batch() iui: InsertURLsInfo = db_data_creator.urls(batch_id=batch_id, url_count=1) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 8d970d48..9ded2a28 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -9,7 +9,7 @@ from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import ConfirmedUrlAgency, MetadataAnnotation, URL +from collector_db.models import URL from collector_manager.enums import URLStatus from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator @@ -111,25 +111,7 @@ def test_delete_url_updated_at(db_data_creator: DBDataCreator): url = db_client.get_urls_by_batch(batch_id=batch_id, page=1)[0] assert url.updated_at > old_updated_at -@pytest.mark.asyncio -async def test_get_url_metadata(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - url_id = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0].url_id - - adb_client = AsyncDatabaseClient() - - await adb_client.add_url_metadata( - url_metadata_info=URLMetadataInfo( - url_id=url_id, - attribute=URLMetadataAttributeType.RELEVANT, - value="False", - validation_status=ValidationStatus.PENDING_VALIDATION, - validation_source=ValidationSource.MACHINE_LEARNING, - ) - ) - metadata = await adb_client.get_url_metadata_by_status(url_status=URLStatus.PENDING) - print(metadata) @pytest.mark.asyncio async def test_add_url_error_info(db_data_creator: DBDataCreator): @@ -162,40 +144,6 @@ async def test_add_url_error_info(db_data_creator: DBDataCreator): assert result.url_id in url_ids assert result.error == "test error" -@pytest.mark.asyncio -async def test_get_urls_with_html_data_and_no_relevancy_metadata( - db_data_creator: DBDataCreator, -): - batch_id = db_data_creator.batch() - url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings - url_ids = [url_info.url_id for url_info in url_mappings] - await db_data_creator.html_data(url_ids) - await db_data_creator.metadata([url_ids[0]]) - results = await db_data_creator.adb_client.get_urls_with_html_data_and_without_metadata_type( - without_metadata_type=URLMetadataAttributeType.RELEVANT - ) - - permitted_url_ids = [url_id for url_id in url_ids if url_id != url_ids[0]] - assert len(results) == 2 - for result in results: - assert result.url_id in permitted_url_ids - assert len(result.html_infos) == 2 - -@pytest.mark.asyncio -async def test_get_urls_with_metadata(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings - url_ids = [url_info.url_id for url_info in url_mappings] - await db_data_creator.metadata([url_ids[0]]) - # Neither of these two URLs should be picked up - await db_data_creator.metadata([url_ids[1]], attribute=URLMetadataAttributeType.RECORD_TYPE) - await db_data_creator.metadata([url_ids[2]], validation_status=ValidationStatus.VALIDATED) - results = await db_data_creator.adb_client.get_urls_with_metadata( - 
attribute=URLMetadataAttributeType.RELEVANT, - validation_status=ValidationStatus.PENDING_VALIDATION - ) - assert len(results) == 1 - @pytest.mark.asyncio async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreator): @@ -377,45 +325,19 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): adb_client = db_data_creator.adb_client # Approve URL. Only URL should be affected. No other properties should be changed. - await adb_client.approve_url(url_mapping.url_id) - - # Confirm same agency id is listed as confirmed - confirmed_agencies = await adb_client.get_all( - ConfirmedUrlAgency + await adb_client.approve_url( + url_mapping.url_id, + record_type=RecordType.ARREST_RECORDS, + relevant=True ) - assert len(confirmed_agencies) == 1 - confirmed_agency = confirmed_agencies[0] - assert confirmed_agency.url_id == url_mapping.url_id - assert confirmed_agency.agency_id == agency_id - - # Confirm two metadata entries - metadatas = await adb_client.get_all( - MetadataAnnotation - ) - assert len(metadatas) == 2 - record_type_metadata = None - relevant_metadata = None - for metadata in metadatas: - if metadata.attribute == URLMetadataAttributeType.RECORD_TYPE.value: - record_type_metadata = metadata - elif metadata.attribute == URLMetadataAttributeType.RELEVANT.value: - relevant_metadata = metadata - - # - One is Record Type, with record type as ARREST_RECORDS and set as approved - - assert record_type_metadata.value == RecordType.ARREST_RECORDS.value - assert record_type_metadata.validation_status == ValidationStatus.VALIDATED.value - - # - One is Relevant, and is set as TRUE and approved - assert relevant_metadata.value == "True" - assert relevant_metadata.validation_status == ValidationStatus.VALIDATED.value - - # Confirm URL - urls = await adb_client.get_all( - URL - ) + # Confirm same agency id is listed as confirmed + urls = await adb_client.get_all(URL) assert len(urls) == 1 url = urls[0] - assert url.status == URLStatus.APPROVED + assert url.id == url_mapping.url_id + assert url.agency_id == agency_id + assert url.record_type == RecordType.ARREST_RECORDS.value + assert url.relevant == True + assert url.outcome == URLStatus.VALIDATED.value diff --git a/tests/test_automated/integration/security_manager/test_security_manager.py b/tests/test_automated/integration/security_manager/test_security_manager.py index 010c3bf2..3dc676ad 100644 --- a/tests/test_automated/integration/security_manager/test_security_manager.py +++ b/tests/test_automated/integration/security_manager/test_security_manager.py @@ -17,7 +17,10 @@ def mock_get_secret_key(mocker): SECRET_KEY = "test_secret_key" VALID_TOKEN = "valid_token" INVALID_TOKEN = "invalid_token" -FAKE_PAYLOAD = {"sub": 1, "permissions": [Permissions.SOURCE_COLLECTOR.value]} +FAKE_PAYLOAD = { + "sub": 1, + "permissions": [Permissions.SOURCE_COLLECTOR.value] +} def test_api_with_valid_token(mock_get_secret_key): diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py index c8df809c..1c1289e7 100644 --- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -7,7 +7,7 @@ from aiohttp import ClientSession from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse -from collector_db.models import ConfirmedUrlAgency, Agency, AutomatedUrlAgencySuggestion +from 
collector_db.models import Agency, AutomatedUrlAgencySuggestion from collector_manager.enums import CollectorType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo @@ -155,7 +155,7 @@ async def mock_run_subtask( # Check confirmed and auto suggestions adb_client = db_data_creator.adb_client - confirmed_suggestions = await adb_client.get_all(ConfirmedUrlAgency) + confirmed_suggestions = await adb_client.get_urls_with_confirmed_agencies() assert len(confirmed_suggestions) == 2 agencies = await adb_client.get_all(Agency) diff --git a/tests/test_automated/integration/tasks/test_url_record_type_task.py b/tests/test_automated/integration/tasks/test_url_record_type_task.py index cf4c8e0e..c56acec1 100644 --- a/tests/test_automated/integration/tasks/test_url_record_type_task.py +++ b/tests/test_automated/integration/tasks/test_url_record_type_task.py @@ -3,12 +3,11 @@ import pytest from collector_db.enums import TaskType -from collector_db.models import URLMetadata +from collector_db.models import AutoRecordTypeSuggestion from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.enums import RecordType, BatchStatus from tests.helpers.DBDataCreator import DBDataCreator -from tests.helpers.assert_functions import assert_database_has_no_tasks from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier @pytest.mark.asyncio @@ -51,8 +50,7 @@ async def test_url_record_type_task(db_data_creator: DBDataCreator): assert task.url_error_count == 1 # Get metadata - metadata_results = await db_data_creator.adb_client.get_all(URLMetadata) - for metadata_row in metadata_results: - assert metadata_row.notes == "test_notes" - assert metadata_row.value == RecordType.ACCIDENT_REPORTS.value + suggestions = await db_data_creator.adb_client.get_all(AutoRecordTypeSuggestion) + for suggestion in suggestions: + assert suggestion.record_type == RecordType.ACCIDENT_REPORTS.value diff --git a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py index 188621b7..11ef770a 100644 --- a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py +++ b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py @@ -5,7 +5,8 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.enums import ValidationStatus, ValidationSource -from collector_db.models import URLMetadata, Task +from collector_db.models import AutoRelevantSuggestion +from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator from tests.helpers.assert_functions import assert_database_has_no_tasks from hugging_face.HuggingFaceInterface import HuggingFaceInterface @@ -39,7 +40,8 @@ def mock_get_url_relevancy( adb_client=AsyncDatabaseClient(), huggingface_interface=mock_hf_interface ) - await task_operator.run_task(1) + meets_task_prerequisites = await task_operator.meets_task_prerequisites() + assert not meets_task_prerequisites await assert_database_has_no_tasks(db_data_creator.adb_client) @@ -47,15 +49,14 @@ def mock_get_url_relevancy( url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings url_ids = [url_info.url_id 
for url_info in url_mappings] await db_data_creator.html_data(url_ids) - await db_data_creator.metadata([url_ids[0]]) - await task_operator.run_task(1) + run_info: TaskOperatorRunInfo = await task_operator.run_task(1) + assert run_info.outcome == TaskOperatorOutcome.SUCCESS - results = await db_data_creator.adb_client.get_all(URLMetadata) + + results = await db_data_creator.adb_client.get_all(AutoRelevantSuggestion) assert len(results) == 3 for result in results: assert result.url_id in url_ids - assert result.value in ['True', 'False'] - assert result.validation_status == ValidationStatus.PENDING_VALIDATION.value - assert result.validation_source == ValidationSource.MACHINE_LEARNING.value \ No newline at end of file + assert result.relevant == num_to_bool(result.url_id % 2) diff --git a/util/alembic_helpers.py b/util/alembic_helpers.py new file mode 100644 index 00000000..d2120634 --- /dev/null +++ b/util/alembic_helpers.py @@ -0,0 +1,38 @@ +from alembic import op +import sqlalchemy as sa + +def switch_enum_type( + table_name, + column_name, + enum_name, + new_enum_values, + drop_old_enum=True +): + """ + Switches an ENUM type in a PostgreSQL column by: + 1. Renaming the old enum type. + 2. Creating the new enum type with the same name. + 3. Updating the column to use the new enum type. + 4. Dropping the old enum type. + + :param table_name: Name of the table containing the ENUM column. + :param column_name: Name of the column using the ENUM type. + :param enum_name: Name of the ENUM type in PostgreSQL. + :param new_enum_values: List of new ENUM values. + :param drop_old_enum: Whether to drop the old ENUM type. + """ + + # Rename old enum type + old_enum_temp_name = f"{enum_name}_old" + op.execute(f'ALTER TYPE "{enum_name}" RENAME TO "{old_enum_temp_name}"') + + # Create new enum type with the updated values + new_enum_type = sa.Enum(*new_enum_values, name=enum_name) + new_enum_type.create(op.get_bind()) + + # Alter the column type to use the new enum type + op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + + # Drop the old enum type + if drop_old_enum: + op.execute(f'DROP TYPE "{old_enum_temp_name}"') From 6133f206c57222b5267c59fe9d8ca373c8ec4434 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 25 Feb 2025 12:06:19 -0500 Subject: [PATCH 059/182] build(api): add approve source endpoint and overhaul metadata Create `/review/approve-source` endpoint Overhaul annotation backend for better maintainability BREAKING CHANGE: Annotations no longer return metadata ids, but url ids. Approval or suggested annotations must now include url ids instead of metadata ids. 
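As a sketch of the new contract (names taken from the updated test helpers in this series; the `request_validator` object is the test suite's RequestValidator and is assumed to be in scope):

    # Before: record-type annotations were posted against a metadata id
    # request_validator.post_record_type_annotation_and_get_next(metadata_id, post_info)

    # After: annotations are posted against the URL id itself
    request_validator.post_record_type_annotation_and_get_next(
        url_id=url_mapping.url_id,
        record_type_annotation_post_info=RecordTypeAnnotationPostInfo(
            record_type=RecordType.ACCIDENT_REPORTS
        ),
    )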
--- .../integration/collector_db/test_db_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 9ded2a28..92a44dca 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -234,7 +234,10 @@ async def test_get_next_url_for_final_review_favor_more_components(db_data_creat @pytest.mark.asyncio -async def test_get_next_url_for_final_review_favor_more_annotations(db_data_creator: DBDataCreator): +async def test_get_next_url_for_final_review_favor_more_annotations( + db_data_creator: DBDataCreator, + wipe_database +): """ Test in the case of two URLs with the same number of components annotated, favoring the one with more total annotations """ From 0cbaf8d6c9e3f044fa4b7aa633db9db751685cb5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 25 Feb 2025 12:22:23 -0500 Subject: [PATCH 060/182] build(api): add approve source endpoint and overhaul metadata Create `/review/approve-source` endpoint Overhaul annotation backend for better maintainability BREAKING CHANGE: Annotations no longer return metadata ids, but url ids. Approval or suggested annotations must now include url ids instead of metadata ids. --- collector_db/AsyncDatabaseClient.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index cf7f36cb..81d6dd02 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -5,6 +5,7 @@ from sqlalchemy import select, exists, func, case, desc, Select, not_, and_ from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute +from sqlalchemy.sql.functions import coalesce from starlette import status from collector_db.ConfigManager import ConfigManager @@ -1004,7 +1005,7 @@ def count_subquery(model: Type[Base]): sum_of_count_subqueries = ( sum( [ - subquery.c.count + coalesce(subquery.c.count, 0) for subquery in count_subqueries ] ) From 7bcce5c1e574cfcc8be01d0c71fbf0be949fb748 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 26 Feb 2025 11:37:50 -0500 Subject: [PATCH 061/182] build(api): add approve source endpoint and overhaul metadata Create `/review/approve-source` endpoint Overhaul annotation backend for better maintainability BREAKING CHANGE: Annotations no longer return metadata ids, but url ids. Approval or suggested annotations must now include url ids instead of metadata ids. 
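A minimal usage sketch of the new endpoint, with field names as exercised by `test_review.py` below (the `request_validator` helper and the identifiers passed in are illustrative):

    # Approve the reviewed source and receive the next source for review
    outer_response = await request_validator.approve_and_get_next_source_for_review(
        approval_info=FinalReviewApprovalInfo(
            url_id=url_mapping.url_id,
            record_type=RecordType.ARREST_RECORDS,
            relevant=True,
            agency_id=agency_id,
        )
    )
    # next_source is None once the review queue is exhausted
    if outer_response.next_source is None:
        ...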
--- api/routes/review.py | 18 +++++-- core/DTOs/GetNextURLForFinalReviewResponse.py | 5 ++ .../api/helpers/RequestValidator.py | 17 +++++-- .../integration/api/test_review.py | 47 ++++++++++++++++++- 4 files changed, 80 insertions(+), 7 deletions(-) diff --git a/api/routes/review.py b/api/routes/review.py index f1b7210a..0933d27d 100644 --- a/api/routes/review.py +++ b/api/routes/review.py @@ -2,7 +2,9 @@ from api.dependencies import get_async_core from core.AsyncCore import AsyncCore -from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \ + GetNextURLForFinalReviewOuterResponse from security_manager.SecurityManager import AccessInfo, get_access_info review_router = APIRouter( @@ -15,5 +17,15 @@ async def get_next_source( core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), -) -> GetNextURLForFinalReviewResponse: - return await core.get_next_source_for_review() \ No newline at end of file +) -> GetNextURLForFinalReviewOuterResponse: + next_source = await core.get_next_source_for_review() + return GetNextURLForFinalReviewOuterResponse(next_source=next_source) + +@review_router.post("/approve-source") +async def approve_source( + approval_info: FinalReviewApprovalInfo, + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), +) -> GetNextURLForFinalReviewOuterResponse: + next_source = await core.approve_and_get_next_source_for_review(approval_info) + return GetNextURLForFinalReviewOuterResponse(next_source=next_source) \ No newline at end of file diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py index fad414af..df28040b 100644 --- a/core/DTOs/GetNextURLForFinalReviewResponse.py +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -60,4 +60,9 @@ class GetNextURLForFinalReviewResponse(BaseModel): html_info: ResponseHTMLInfo = Field(title="The HTML content of the URL") annotations: FinalReviewAnnotationInfo = Field( title="The annotations for the URL, from both users and the auto-labeler", + ) + +class GetNextURLForFinalReviewOuterResponse(BaseModel): + next_source: Optional[GetNextURLForFinalReviewResponse] = Field( + title="The next source to be reviewed", ) \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 064e9912..e2c8a479 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -9,6 +9,7 @@ from collector_db.enums import TaskType from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse @@ -18,7 +19,8 @@ from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse -from core.DTOs.GetNextURLForFinalReviewResponse import 
GetNextURLForFinalReviewResponse +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \ + GetNextURLForFinalReviewOuterResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo @@ -259,9 +261,18 @@ def get_tasks( ) return GetTasksResponse(**data) - async def review_next_source(self) -> GetNextURLForFinalReviewResponse: + async def review_next_source(self) -> GetNextURLForFinalReviewOuterResponse: data = self.get( url=f"/review/next-source" ) - return GetNextURLForFinalReviewResponse(**data) + return GetNextURLForFinalReviewOuterResponse(**data) + async def approve_and_get_next_source_for_review( + self, + approval_info: FinalReviewApprovalInfo + ) -> GetNextURLForFinalReviewOuterResponse: + data = self.post( + url=f"/review/approve-source", + json=approval_info.model_dump(mode='json') + ) + return GetNextURLForFinalReviewOuterResponse(**data) diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index a69f474a..99af93e9 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -1,5 +1,9 @@ import pytest +from collector_db.models import URL +from collector_manager.enums import URLStatus +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse from core.enums import RecordType from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review @@ -19,7 +23,9 @@ async def test_review_next_source(api_test_helper): count=3 ) - result = await ath.request_validator.review_next_source() + outer_result = await ath.request_validator.review_next_source() + + result = outer_result.next_source assert result.url == url_mapping.url html_info = result.html_info @@ -53,3 +59,42 @@ async def test_review_next_source(api_test_helper): for i in range(3): assert user_agency_suggestions_as_list[i].count == 3 - i +@pytest.mark.asyncio +async def test_approve_and_get_next_source_for_review(api_test_helper): + ath = api_test_helper + db_data_creator = ath.db_data_creator + + url_mapping = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + # Add confirmed agency + agency_id = await db_data_creator.agency_confirmed_suggestion( + url_id=url_mapping.url_id + ) + + result: GetNextURLForFinalReviewOuterResponse = await ath.request_validator.approve_and_get_next_source_for_review( + approval_info=FinalReviewApprovalInfo( + url_id=url_mapping.url_id, + record_type=RecordType.ARREST_RECORDS, + relevant=True, + agency_id=agency_id + ) + ) + + assert result.next_source is None + + adb_client = db_data_creator.adb_client + # Confirm same agency id is listed as confirmed + urls = await adb_client.get_all(URL) + assert len(urls) == 1 + url = urls[0] + assert url.id == url_mapping.url_id + assert url.agency_id == agency_id + assert url.record_type == RecordType.ARREST_RECORDS.value + assert url.relevant == True + assert url.outcome == URLStatus.VALIDATED.value + + From 048b3172dc6abd0d96fbb3b84bd2e997cdff1a93 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 11 Mar 2025 15:55:33 -0400 Subject: [PATCH 062/182] feat(app): add table to record users validating URLs. 
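The new `approving_user_url` table records which user validated each URL. A sketch of how it could be queried (SQLAlchemy, against the `ApprovingUserURL` model added below; the query itself is illustrative and not part of this commit):

    # Count how many URLs each user has approved
    from sqlalchemy import func, select

    from collector_db.models import ApprovingUserURL

    statement = (
        select(ApprovingUserURL.user_id, func.count(ApprovingUserURL.url_id))
        .group_by(ApprovingUserURL.user_id)
    )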
--- ...c4f56d4_create_approving_user_url_table.py | 34 +++++++++++++++++++ api/routes/review.py | 5 ++- collector_db/AsyncDatabaseClient.py | 12 +++++-- collector_db/models.py | 18 ++++++++++ core/AsyncCore.py | 7 ++-- .../collector_db/test_db_client.py | 10 ++++-- 6 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 alembic/versions/2025_03_11_1539-69f7cc4f56d4_create_approving_user_url_table.py diff --git a/alembic/versions/2025_03_11_1539-69f7cc4f56d4_create_approving_user_url_table.py b/alembic/versions/2025_03_11_1539-69f7cc4f56d4_create_approving_user_url_table.py new file mode 100644 index 00000000..f38d33dc --- /dev/null +++ b/alembic/versions/2025_03_11_1539-69f7cc4f56d4_create_approving_user_url_table.py @@ -0,0 +1,34 @@ +"""Create approving_user_url table + +Revision ID: 69f7cc4f56d4 +Revises: 33421c0590bb +Create Date: 2025-03-11 15:39:27.563567 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '69f7cc4f56d4' +down_revision: Union[str, None] = '33421c0590bb' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'approving_user_url', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('user_id', sa.Integer(), nullable=False), + sa.Column('url_id', sa.Integer(), nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ), + sa.UniqueConstraint('url_id', name='approving_user_url_uq_user_id_url_id') + ) + + +def downgrade() -> None: + op.drop_table('approving_user_url') diff --git a/api/routes/review.py b/api/routes/review.py index 0933d27d..61dccbbb 100644 --- a/api/routes/review.py +++ b/api/routes/review.py @@ -27,5 +27,8 @@ async def approve_source( core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> GetNextURLForFinalReviewOuterResponse: - next_source = await core.approve_and_get_next_source_for_review(approval_info) + next_source = await core.approve_and_get_next_source_for_review( + approval_info, + access_info=access_info + ) return GetNextURLForFinalReviewOuterResponse(next_source=next_source) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 81d6dd02..82fedc93 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -22,7 +22,7 @@ from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ - UserRecordTypeSuggestion + UserRecordTypeSuggestion, ApprovingUserURL from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo @@ -1107,7 +1107,8 @@ async def approve_url( url_id: int, record_type: RecordType, relevant: bool, - agency_id: Optional[int] = None + user_id: int, + agency_id: Optional[int] = None, ) -> None: # Get URL @@ -1135,3 +1136,10 @@ async def approve_url( # If it does, do nothing url.outcome = URLStatus.VALIDATED.value + + approving_user_url = 
ApprovingUserURL( + user_id=user_id, + url_id=url_id + ) + + session.add(approving_user_url) diff --git a/collector_db/models.py b/collector_db/models.py index 51fc4a2a..7990eb65 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -128,6 +128,24 @@ class URL(Base): "AutoRelevantSuggestion", uselist=False, back_populates="url") user_relevant_suggestions = relationship( "UserRelevantSuggestion", back_populates="url") + approving_users = relationship( + "ApprovingUserURL", back_populates="url") + +class ApprovingUserURL(Base): + __tablename__ = 'approving_user_url' + __table_args__ = ( + UniqueConstraint( + "url_id", + name="approving_user_url_uq_user_id_url_id"), + ) + + id = Column(Integer, primary_key=True) + user_id = Column(Integer, nullable=False) + url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) + created_at = get_created_at_column() + + # Relationships + url = relationship("URL", back_populates="approving_users") class RootURL(Base): __tablename__ = 'root_url_cache' diff --git a/core/AsyncCore.py b/core/AsyncCore.py index a576082f..4854926e 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -30,6 +30,7 @@ from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier from pdap_api_client.AccessManager import AccessManager from pdap_api_client.PDAPClient import PDAPClient +from security_manager.SecurityManager import AccessInfo from util.helper_functions import get_from_env @@ -213,12 +214,14 @@ async def get_next_source_for_review(self): async def approve_and_get_next_source_for_review( self, - approval_info: FinalReviewApprovalInfo + approval_info: FinalReviewApprovalInfo, + access_info: AccessInfo ): await self.adb_client.approve_url( url_id=approval_info.url_id, record_type=approval_info.record_type, relevant=approval_info.relevant, - agency_id=approval_info.agency_id + agency_id=approval_info.agency_id, + user_id=access_info.user_id ) return await self.get_next_source_for_review() diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 92a44dca..b8ac56f1 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -9,7 +9,7 @@ from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import URL +from collector_db.models import URL, ApprovingUserURL from collector_manager.enums import URLStatus from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator @@ -331,7 +331,8 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): await adb_client.approve_url( url_mapping.url_id, record_type=RecordType.ARREST_RECORDS, - relevant=True + relevant=True, + user_id=1 ) # Confirm same agency id is listed as confirmed @@ -344,3 +345,8 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert url.relevant == True assert url.outcome == URLStatus.VALIDATED.value + approving_user_urls = await adb_client.get_all(ApprovingUserURL) + assert len(approving_user_urls) == 1 + assert approving_user_urls[0].user_id == 1 + assert approving_user_urls[0].url_id == url_mapping.url_id + From 74247a70b71040188021ceea4e7fabff964e19f2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Mar 2025 17:46:23 -0400 Subject: [PATCH 063/182] 
DRAFT --- ENV.md | 8 +++-- collector_db/enums.py | 1 + core/AsyncCore.py | 30 ++++++++++++++----- .../task_data_objects/SubmitApprovedURLTDO.py | 11 +++++++ core/README.md | 3 +- core/classes/SubmitApprovedURLTaskOperator.py | 29 +++++++++++++++++ pdap_api_client/AccessManager.py | 5 ++-- 7 files changed, 73 insertions(+), 14 deletions(-) create mode 100644 core/DTOs/task_data_objects/SubmitApprovedURLTDO.py create mode 100644 core/classes/SubmitApprovedURLTaskOperator.py diff --git a/ENV.md b/ENV.md index 68359348..92b7de31 100644 --- a/ENV.md +++ b/ENV.md @@ -14,11 +14,13 @@ Please ensure these are properly defined in a `.env` file in the root directory. |`POSTGRES_DB` | The database name for the test database | `source_collector_test_db` | |`POSTGRES_HOST` | The host for the test database | `127.0.0.1` | |`POSTGRES_PORT` | The port for the test database | `5432` | -|`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token `JWT_SECRET_KEY` that is used in the Data Sources App for encoding. | `abc123` | +|`DS_APP_SECRET_KEY`| The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token `JWT_SECRET_KEY` that is used in the Data Sources App for encoding. | `abc123` | |`DEV`| Set to any value to run the application in development mode. | `true` | |`DEEPSEEK_API_KEY`| The API key required for accessing the DeepSeek API. | `abc123` | |`OPENAI_API_KEY`| The API key required for accessing the OpenAI API. | `abc123` | -|`PDAP_EMAIL`| An email address for accessing the PDAP API. | `abc123@test.com` | -|`PDAP_PASSWORD`| A password for accessing the PDAP API. | `abc123` | +|`PDAP_EMAIL`| An email address for accessing the PDAP API.[^1] | `abc123@test.com` | +|`PDAP_PASSWORD`| A password for accessing the PDAP API.[^1] | `abc123` | |`PDAP_API_KEY`| An API key for accessing the PDAP API. | `abc123` | +|`PDAP_API_URL`| The URL for the PDAP API. | `https://data-sources-v2.pdap.dev/api` | +[^1]: The user account in question will require elevated permissions to access certain endpoints. At a minimum, the user will require the `source_collector` and `db_write` permissions.
\ No newline at end of file diff --git a/collector_db/enums.py b/collector_db/enums.py index 2d82e87b..60b3df13 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -37,6 +37,7 @@ class TaskType(PyEnum): RELEVANCY = "Relevancy" RECORD_TYPE = "Record Type" AGENCY_IDENTIFICATION = "Agency Identification" + SUBMIT_APPROVED = "Submit Approved URLs" class PGEnum(TypeDecorator): impl = postgresql.ENUM diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 4854926e..08480a61 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -18,6 +18,7 @@ from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator +from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator from core.classes.TaskOperatorBase import TaskOperatorBase from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator @@ -51,6 +52,15 @@ def __init__( self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) + async def get_pdap_client(self): + return PDAPClient( + access_manager=AccessManager( + email=get_from_env("PDAP_EMAIL"), + password=get_from_env("PDAP_PASSWORD"), + api_key=get_from_env("PDAP_API_KEY"), + ), + ) + async def get_url_html_task_operator(self): self.logger.info("Running URL HTML Task") operator = URLHTMLTaskOperator( @@ -76,13 +86,7 @@ async def get_url_record_type_task_operator(self): return operator async def get_agency_identification_task_operator(self): - pdap_client = PDAPClient( - access_manager=AccessManager( - email=get_from_env("PDAP_EMAIL"), - password=get_from_env("PDAP_PASSWORD"), - api_key=get_from_env("PDAP_API_KEY"), - ), - ) + pdap_client = await self.get_pdap_client() muckrock_api_interface = MuckrockAPIInterface() operator = AgencyIdentificationTaskOperator( adb_client=self.adb_client, @@ -91,12 +95,22 @@ async def get_agency_identification_task_operator(self): ) return operator + async def get_submit_approved_url_task_operator(self): + pdap_client = await self.get_pdap_client() + operator = SubmitApprovedURLTaskOperator( + adb_client=self.adb_client, + pdap_client=pdap_client + ) + return operator + + async def get_task_operators(self) -> list[TaskOperatorBase]: return [ await self.get_url_html_task_operator(), await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), - await self.get_agency_identification_task_operator() + await self.get_agency_identification_task_operator(), + await self.get_submit_approved_url_task_operator(), ] async def run_tasks(self): diff --git a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py new file mode 100644 index 00000000..ee1b8dc6 --- /dev/null +++ b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py @@ -0,0 +1,11 @@ +from typing import Optional + +from pydantic import BaseModel + +from core.enums import RecordType + + +class SubmitApprovedURLTDO(BaseModel): + url: str + record_type: RecordType + agency_id: Optional[int] \ No newline at end of file diff --git a/core/README.md b/core/README.md index 25b1cde3..9546f613 100644 --- a/core/README.md +++ b/core/README.md @@ -11,4 +11,5 @@ The Source Collector Core is a directory which integrates: - **Cycle**: Refers to the overall lifecycle for Each URL -- from initial retrieval in a Batch to 
either disposal or incorporation into the Data Sources App Database - **Task**: A semi-independent operation performed on a set of URLs. These include: Collection, retrieving HTML data, getting metadata via Machine Learning, and so on. - **Task Set**: Refers to a group of URLs that are operated on together as part of a single task. These URLs in a set are not necessarily all from the same batch. URLs in a task set should only be operated on in that task once. -- **Task Operator**: A class which performs a single task on a set of URLs. \ No newline at end of file +- **Task Operator**: A class which performs a single task on a set of URLs. +- **Subtask**: A subcomponent of a Task Operator which performs a single operation on a single URL. Often distinguished by the Collector Strategy used for that URL. \ No newline at end of file diff --git a/core/classes/SubmitApprovedURLTaskOperator.py b/core/classes/SubmitApprovedURLTaskOperator.py new file mode 100644 index 00000000..633f8c1e --- /dev/null +++ b/core/classes/SubmitApprovedURLTaskOperator.py @@ -0,0 +1,29 @@ +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.enums import TaskType +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO +from core.classes.TaskOperatorBase import TaskOperatorBase +from pdap_api_client.PDAPClient import PDAPClient + + +class SubmitApprovedURLTaskOperator(TaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient + ): + super().__init__(adb_client) + self.pdap_client = pdap_client + + @property + def task_type(self): + return TaskType.SUBMIT_APPROVED + + async def meets_task_prerequisites(self): + return await self.adb_client.has_validated_urls() + + async def inner_task_logic(self): + raise NotImplementedError + + async def update_errors_in_database(self, error_tdos: list[SubmitApprovedURLTDO]): + raise NotImplementedError \ No newline at end of file diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py index c39ba1e8..1020f365 100644 --- a/pdap_api_client/AccessManager.py +++ b/pdap_api_client/AccessManager.py @@ -5,8 +5,8 @@ from aiohttp import ClientSession from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo +from util.helper_functions import get_from_env -API_URL = "https://data-sources-v2.pdap.dev/api" request_methods = { RequestType.POST: ClientSession.post, RequestType.PUT: ClientSession.put, @@ -23,7 +23,8 @@ def build_url( namespace: Namespaces, subdomains: Optional[list[str]] = None ): - url = f"{API_URL}/{namespace.value}" + api_url = get_from_env('PDAP_API_URL') + url = f"{api_url}/{namespace.value}" if subdomains is not None: url = f"{url}/{'/'.join(subdomains)}" return url From d0522c33e5b5b478b250bb45ac7c92048a7bd8d9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 25 Mar 2025 14:54:34 -0400 Subject: [PATCH 064/182] feat(app): Add Miscellaneous URL Metadata Task Add a task for adding miscellaneous URL metadata to the URL properties, utilizing metadata derived from the collector --- ..._add_name_description_and_url_optional_.py | 62 +++ apply_migrations.py | 7 +- collector_db/AsyncDatabaseClient.py | 155 +++---- collector_db/StatementComposer.py | 70 ++- collector_db/enums.py | 1 + collector_db/models.py | 41 +- core/AsyncCore.py | 11 +- .../URLMiscellaneousMetadataTDO.py | 16 + core/classes/TaskOperatorBase.py | 5 +- .../URLMiscellaneousMetadataTaskOperator.py | 62 +++ .../AutoGooglerMiscMetadataSubtask.py | 10 + .../CKANMiscMetadataSubtask.py | 13 + .../MiscellaneousMetadataSubtaskBase.py | 10 + 
.../MuckrockMiscMetadataSubtask.py | 10 + .../MiscellaneousMetadata}/__init__.py | 0 html_tag_collector/URLRequestInterface.py | 8 +- .../DTOs/LabelStudioTaskExportInfo.py | 39 -- .../LabelStudioAPIManager.py | 325 -------------- label_studio_interface/LabelStudioConfig.py | 32 -- .../PreAnnotationCreator.py | 88 ---- label_studio_interface/README.md | 28 -- label_studio_interface/__init__.py | 0 label_studio_interface/basic_demonstration.py | 126 ------ label_studio_interface/dev.env | 3 - local_database/DataDumper/dump.sh | 2 +- local_database/DataDumper/restore.sh | 8 +- start_mirrored_local_app.py | 402 ++++++++++++++++++ tests/helpers/DBDataCreator.py | 4 +- .../integration/tasks/test_url_html_task.py | 2 - .../test_url_miscellaneous_metadata_task.py | 143 +++++++ 30 files changed, 873 insertions(+), 810 deletions(-) create mode 100644 alembic/versions/2025_03_15_1745-6eb8084e2f48_add_name_description_and_url_optional_.py create mode 100644 core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py create mode 100644 core/classes/URLMiscellaneousMetadataTaskOperator.py create mode 100644 core/classes/subtasks/MiscellaneousMetadata/AutoGooglerMiscMetadataSubtask.py create mode 100644 core/classes/subtasks/MiscellaneousMetadata/CKANMiscMetadataSubtask.py create mode 100644 core/classes/subtasks/MiscellaneousMetadata/MiscellaneousMetadataSubtaskBase.py create mode 100644 core/classes/subtasks/MiscellaneousMetadata/MuckrockMiscMetadataSubtask.py rename {label_studio_interface/DTOs => core/classes/subtasks/MiscellaneousMetadata}/__init__.py (100%) delete mode 100644 label_studio_interface/DTOs/LabelStudioTaskExportInfo.py delete mode 100644 label_studio_interface/LabelStudioAPIManager.py delete mode 100644 label_studio_interface/LabelStudioConfig.py delete mode 100644 label_studio_interface/PreAnnotationCreator.py delete mode 100644 label_studio_interface/README.md delete mode 100644 label_studio_interface/__init__.py delete mode 100644 label_studio_interface/basic_demonstration.py delete mode 100644 label_studio_interface/dev.env create mode 100644 start_mirrored_local_app.py create mode 100644 tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py diff --git a/alembic/versions/2025_03_15_1745-6eb8084e2f48_add_name_description_and_url_optional_.py b/alembic/versions/2025_03_15_1745-6eb8084e2f48_add_name_description_and_url_optional_.py new file mode 100644 index 00000000..e8b542f9 --- /dev/null +++ b/alembic/versions/2025_03_15_1745-6eb8084e2f48_add_name_description_and_url_optional_.py @@ -0,0 +1,62 @@ +"""Add name, description, and url optional data source metadata + +Revision ID: 6eb8084e2f48 +Revises: 69f7cc4f56d4 +Create Date: 2025-03-15 17:45:46.619721 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from util.alembic_helpers import switch_enum_type + +# revision identifiers, used by Alembic. 
+revision: str = '6eb8084e2f48' +down_revision: Union[str, None] = '69f7cc4f56d4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Add name and description columns to URL table + op.add_column('urls', sa.Column('name', sa.String(), nullable=True)) + op.add_column('urls', sa.Column('description', sa.String(), nullable=True)) + + # Create URL_optional_data_source_metadata + op.create_table( + 'url_optional_data_source_metadata', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('url_id', sa.Integer(), nullable=False), + sa.Column('record_formats', sa.ARRAY(sa.String()), nullable=True), + sa.Column('data_portal_type', sa.String(), nullable=True), + sa.Column('supplying_entity', sa.String(), nullable=True), + sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ), + sa.PrimaryKeyConstraint('id') + ) + + # Add 'Misc Metadata' to TaskType enum + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=['HTML', 'Relevancy', 'Record Type', 'Agency Identification', 'Misc Metadata'] + ) + + +def downgrade() -> None: + # Remove name and description columns from URL table + op.drop_column('urls', 'name') + op.drop_column('urls', 'description') + + # Drop URL_optional_data_source_metadata + op.drop_table('url_optional_data_source_metadata') + + # Remove 'Misc Metadata' from TaskType enum + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=['HTML', 'Relevancy', 'Record Type', 'Agency Identification'] + ) diff --git a/apply_migrations.py b/apply_migrations.py index 5be4cd99..183e7d11 100644 --- a/apply_migrations.py +++ b/apply_migrations.py @@ -3,7 +3,7 @@ from collector_db.helper_functions import get_postgres_connection_string -if __name__ == "__main__": +def apply_migrations(): print("Applying migrations...") alembic_config = Config("alembic.ini") alembic_config.set_main_option( @@ -11,4 +11,7 @@ get_postgres_connection_string() ) command.upgrade(alembic_config, "head") - print("Migrations applied.") \ No newline at end of file + print("Migrations applied.") + +if __name__ == "__main__": + apply_migrations() \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 82fedc93..41685a4b 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -2,7 +2,7 @@ from typing import Optional, Type from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_ +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute from sqlalchemy.sql.functions import coalesce @@ -22,7 +22,7 @@ from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ - UserRecordTypeSuggestion, ApprovingUserURL + UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata from collector_manager.enums import URLStatus, CollectorType from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo 
import GetNextRelevanceAnnotationResponseInfo @@ -34,6 +34,7 @@ GetURLsResponseInnerInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO from core.enums import BatchStatus, SuggestionType, RecordType from html_tag_collector.DataClassTags import convert_to_response_html_info @@ -353,6 +354,68 @@ async def has_pending_urls_without_html_data(self, session: AsyncSession) -> boo scalar_result = await session.scalars(statement) return bool(scalar_result.first()) + @session_manager + async def has_pending_urls_missing_miscellaneous_metadata(self, session: AsyncSession) -> bool: + query = StatementComposer.pending_urls_missing_miscellaneous_metadata_query() + query = query.limit(1) + + scalar_result = await session.scalars(query) + return bool(scalar_result.first()) + + @session_manager + async def get_pending_urls_missing_miscellaneous_metadata( + self, + session: AsyncSession + ) -> list[URLMiscellaneousMetadataTDO]: + query = StatementComposer.pending_urls_missing_miscellaneous_metadata_query() + query = ( + query.options( + selectinload(URL.batch), + ).limit(100).order_by(URL.id) + ) + + scalar_result = await session.scalars(query) + all_results = scalar_result.all() + final_results = [] + for result in all_results: + tdo = URLMiscellaneousMetadataTDO( + url_id=result.id, + collector_metadata=result.collector_metadata, + collector_type=CollectorType(result.batch.strategy), + ) + final_results.append(tdo) + return final_results + + @session_manager + async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URLMiscellaneousMetadataTDO]): + updates = [] + + for tdo in tdos: + update_query = update( + URL + ).where( + URL.id == tdo.url_id + ).values( + name=tdo.name, + description=tdo.description, + ) + + updates.append(update_query) + + for stmt in updates: + await session.execute(stmt) + + for tdo in tdos: + metadata_object = URLOptionalDataSourceMetadata( + url_id=tdo.url_id, + record_formats=tdo.record_formats, + data_portal_type=tdo.data_portal_type, + supplying_entity=tdo.supplying_entity + ) + session.add(metadata_object) + + + @session_manager async def get_pending_urls_without_html_data(self, session: AsyncSession): # TODO: Add test that includes some urls WITH html data. 
Check they're not returned @@ -433,97 +496,15 @@ async def has_urls_with_html_data_and_without_auto_relevant_suggestion(self, ses ) - #TODO: Slated for deletion - @session_manager - async def get_urls_with_html_data_and_without_metadata_type( - self, - session: AsyncSession, - without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT - ) -> list[URLWithHTML]: - - # Get URLs with no relevancy metadata - statement = (select(URL) - .options(selectinload(URL.html_content)) - .where(URL.outcome == URLStatus.PENDING.value)) - # Exclude URLs with auto suggested record types - statement = self.statement_composer.exclude_urls_with_extant_model( - statement=statement, - model=AutoRecordTypeSuggestion - ) - statement = statement.limit(100).order_by(URL.id) - - - # TODO: The below can probably be generalized - - - statement = self.statement_composer.exclude_urls_with_select_metadata( - statement=statement, - attribute=without_metadata_type - ) - # TODO: Generalize - statement = statement.limit(100).order_by(URL.id) - raw_result = await session.execute(statement) - result = raw_result.all() - url_ids_to_urls = {url_id: url for url_id, url, _ in result} - url_ids_to_html_info = {url_id: [] for url_id, _, _ in result} - - for url_id, _, html_info in result: - url_ids_to_html_info[url_id].append( - URLHTMLContentInfo(**html_info.__dict__) - ) - - final_results = [] - for url_id, url in url_ids_to_urls.items(): - url_with_html = URLWithHTML( - url_id=url_id, - url=url, - html_infos=url_ids_to_html_info[url_id] - ) - final_results.append(url_with_html) - - return final_results - - @session_manager - async def has_pending_urls_with_html_data_and_without_metadata_type( - self, - session: AsyncSession, - without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT - ) -> bool: - # TODO: Generalize this so that it can exclude based on other attributes - # Get URLs with no relevancy metadata - statement = (select(URL.id, URL.url, URLHTMLContent). - join(URLHTMLContent). - where(URL.outcome == URLStatus.PENDING.value)) - statement = self.statement_composer.exclude_urls_with_select_metadata( - statement=statement, - attribute=without_metadata_type - ) - statement = statement.limit(1) - raw_result = await session.execute(statement) - result = raw_result.all() - return len(result) > 0 - - - - # @session_manager - # async def get_annotations_for_metadata_id( - # self, - # session: AsyncSession, - # metadata_id: int - # ) -> list[MetadataAnnotation]: - # statement = (select(MetadataAnnotation). 
- # where(MetadataAnnotation.metadata_id == metadata_id)) - # scalar_result = await session.scalars(statement) - # all_results = scalar_result.all() - # return [MetadataAnnotationInfo(**result.__dict__) for result in all_results] - @session_manager - async def get_all(self, session, model: Base): + async def get_all(self, session, model: Base, order_by_attribute: Optional[str] = None) -> list[Base]: """ Get all records of a model Used primarily in testing """ statement = select(model) + if order_by_attribute: + statement = statement.order_by(getattr(model, order_by_attribute)) result = await session.execute(statement) return result.scalars().all() diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index d69dd078..88da61f3 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,11 +1,11 @@ from typing import Any -from sqlalchemy import Select, select, exists, Table, func, Subquery +from sqlalchemy import Select, select, exists, Table, func, Subquery, and_ from sqlalchemy.orm import aliased from collector_db.enums import URLMetadataAttributeType, ValidationStatus -from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion -from collector_manager.enums import URLStatus +from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch +from collector_manager.enums import URLStatus, CollectorType class StatementComposer: @@ -36,35 +36,7 @@ def exclude_urls_with_extant_model( ) )) - @staticmethod - def exclude_urls_with_select_metadata( - statement: Select, - attribute: URLMetadataAttributeType - ) -> Select: - return (statement.where( - ~exists( - select(URLMetadata.id). - where( - URLMetadata.url_id == URL.id, - URLMetadata.attribute == attribute.value - ) - ) - )) - @staticmethod - def exclude_url_annotated_by_user( - statement: Select, - user_id: int - ) -> Select: - return (statement.where( - ~exists( - select(MetadataAnnotation.id). 
- where( - MetadataAnnotation.metadata_id == URLMetadata.id, - MetadataAnnotation.user_id == user_id - ) - ) - )) @staticmethod @@ -88,19 +60,29 @@ def exclude_urls_with_agency_suggestions( return statement + @staticmethod - async def get_all_html_content_for_url(subquery) -> Select: - statement = ( - select( - subquery.c.url, - subquery.c.metadata_id, - subquery.c.value, - URLHTMLContent.content_type, - URLHTMLContent.content, + def pending_urls_missing_miscellaneous_metadata_query() -> Select: + query = select(URL).where( + and_( + URL.outcome == URLStatus.PENDING.value, + URL.name == None, + URL.description == None, + URLOptionalDataSourceMetadata.url_id == None, + Batch.strategy.in_( + [ + CollectorType.AUTO_GOOGLER.value, + CollectorType.CKAN.value, + CollectorType.MUCKROCK_ALL_SEARCH.value, + CollectorType.MUCKROCK_COUNTY_SEARCH.value, + CollectorType.MUCKROCK_SIMPLE_SEARCH.value + ] + ) + ) + ).outerjoin( + URLOptionalDataSourceMetadata + ).join( + Batch ) - .join(URLHTMLContent) - .where(subquery.c.url_id == URLHTMLContent.url_id) - ) - raw_result = await session.execute(statement) - result = raw_result.all() \ No newline at end of file + return query \ No newline at end of file diff --git a/collector_db/enums.py b/collector_db/enums.py index 2d82e87b..0dd956c5 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -37,6 +37,7 @@ class TaskType(PyEnum): RELEVANCY = "Relevancy" RECORD_TYPE = "Record Type" AGENCY_IDENTIFICATION = "Agency Identification" + MISC_METADATA = "Misc Metadata" class PGEnum(TypeDecorator): impl = postgresql.ENUM diff --git a/collector_db/models.py b/collector_db/models.py index 7990eb65..4a61be88 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -2,11 +2,11 @@ SQLAlchemy ORM models """ from sqlalchemy import func, Column, Integer, String, TIMESTAMP, Float, JSON, ForeignKey, Text, UniqueConstraint, \ - Boolean, DateTime + Boolean, DateTime, ARRAY from sqlalchemy.dialects import postgresql from sqlalchemy.orm import declarative_base, relationship -from collector_db.enums import PGEnum +from collector_db.enums import PGEnum, TaskType from core.enums import BatchStatus, RecordType from util.helper_functions import get_enum_values @@ -85,6 +85,8 @@ class URL(Base): # The batch this URL is associated with batch_id = Column(Integer, ForeignKey('batches.id', name='fk_url_batch_id'), nullable=False) url = Column(Text, unique=True) + name = Column(String) + description = Column(Text) # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. 
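The `pending_urls_missing_miscellaneous_metadata_query` composed in `StatementComposer` above relies on an outer-join anti-join: URLs are LEFT JOINed to `url_optional_data_source_metadata`, and only rows whose joined key comes back NULL (i.e., URLs with no metadata row yet) survive the filter. A self-contained toy version of the pattern, with illustrative models rather than the repo's, might look like:

```python
# Toy demonstration of the outer-join anti-join pattern; models are
# invented for the example and are not the repo's actual tables.
from sqlalchemy import Column, ForeignKey, Integer, String, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Url(Base):
    __tablename__ = "urls_example"
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=True)


class UrlMeta(Base):
    __tablename__ = "url_meta_example"
    id = Column(Integer, primary_key=True)
    url_id = Column(Integer, ForeignKey("urls_example.id"))


# Keep only URLs with no matching metadata row: the outer join leaves
# UrlMeta.id NULL for them, which the WHERE clause then selects.
query = (
    select(Url)
    .outerjoin(UrlMeta, UrlMeta.url_id == Url.id)
    .where(UrlMeta.id == None)  # noqa: E711
)
```

The `== None` comparison is intentional: SQLAlchemy overloads it into SQL `IS NULL`, which is why the composed query above uses it rather than Python's `is None`.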
@@ -130,6 +132,20 @@ class URL(Base): "UserRelevantSuggestion", back_populates="url") approving_users = relationship( "ApprovingUserURL", back_populates="url") + optional_data_source_metadata = relationship( + "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") + +class URLOptionalDataSourceMetadata(Base): + __tablename__ = 'url_optional_data_source_metadata' + + id = Column(Integer, primary_key=True) + url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) + record_formats = Column(ARRAY(String), nullable=True) + data_portal_type = Column(String, nullable=True) + supplying_entity = Column(String, nullable=True) + + # Relationships + url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") class ApprovingUserURL(Base): __tablename__ = 'approving_user_url' @@ -256,10 +272,7 @@ class Task(Base): id = Column(Integer, primary_key=True) task_type = Column( PGEnum( - 'HTML', - 'Relevancy', - 'Record Type', - 'Agency Identification', + *[task_type.value for task_type in TaskType], name='task_type' ), nullable=False) task_status = Column(batch_status_enum, nullable=False) @@ -319,22 +332,6 @@ class Agency(Base): automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") - -# class ConfirmedUrlAgency(Base): -# __tablename__ = "confirmed_url_agency" -# -# id = Column(Integer, primary_key=True, autoincrement=True) -# agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False) -# url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) -# -# agency = relationship("Agency", back_populates="confirmed_urls") -# url = relationship("URL", back_populates="confirmed_agencies") -# -# __table_args__ = ( -# UniqueConstraint("url_id", name="uq_confirmed_url_agency"), -# ) - - class AutomatedUrlAgencySuggestion(Base): __tablename__ = "automated_url_agency_suggestions" diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 4854926e..5317023b 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -20,6 +20,7 @@ from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator from core.classes.TaskOperatorBase import TaskOperatorBase from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator from core.enums import BatchStatus, SuggestionType, RecordType @@ -91,12 +92,19 @@ async def get_agency_identification_task_operator(self): ) return operator + async def get_url_miscellaneous_metadata_task_operator(self): + operator = URLMiscellaneousMetadataTaskOperator( + adb_client=self.adb_client + ) + return operator + async def get_task_operators(self) -> list[TaskOperatorBase]: return [ await self.get_url_html_task_operator(), await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), - await self.get_agency_identification_task_operator() + await self.get_agency_identification_task_operator(), + await self.get_url_miscellaneous_metadata_task_operator() ] async def run_tasks(self): @@ -225,3 +233,4 @@ async def approve_and_get_next_source_for_review( user_id=access_info.user_id ) return await self.get_next_source_for_review() + diff --git 
a/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py b/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py new file mode 100644 index 00000000..d57d1cba --- /dev/null +++ b/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py @@ -0,0 +1,16 @@ +from typing import Optional + +from pydantic import BaseModel + +from collector_manager.enums import CollectorType + + +class URLMiscellaneousMetadataTDO(BaseModel): + url_id: int + collector_metadata: dict + collector_type: CollectorType + name: Optional[str] = None + description: Optional[str] = None + record_formats: Optional[list[str]] = None + data_portal_type: Optional[str] = None + supplying_entity: Optional[str] = None diff --git a/core/classes/TaskOperatorBase.py b/core/classes/TaskOperatorBase.py index ece3bc81..e7c87dac 100644 --- a/core/classes/TaskOperatorBase.py +++ b/core/classes/TaskOperatorBase.py @@ -1,4 +1,4 @@ - +import traceback from abc import ABC, abstractmethod from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.enums import TaskType @@ -50,9 +50,10 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: await self.inner_task_logic() return await self.conclude_task() except Exception as e: + stack_trace = traceback.format_exc() return await self.run_info( outcome=TaskOperatorOutcome.ERROR, - message=str(e) + message=str(e) + "\n" + stack_trace ) async def run_info(self, outcome: TaskOperatorOutcome, message: str): diff --git a/core/classes/URLMiscellaneousMetadataTaskOperator.py b/core/classes/URLMiscellaneousMetadataTaskOperator.py new file mode 100644 index 00000000..38e7446a --- /dev/null +++ b/core/classes/URLMiscellaneousMetadataTaskOperator.py @@ -0,0 +1,62 @@ +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo +from collector_db.enums import TaskType +from collector_manager.enums import CollectorType +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO +from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.subtasks.MiscellaneousMetadata.AutoGooglerMiscMetadataSubtask import AutoGooglerMiscMetadataSubtask +from core.classes.subtasks.MiscellaneousMetadata.CKANMiscMetadataSubtask import CKANMiscMetadataSubtask +from core.classes.subtasks.MiscellaneousMetadata.MiscellaneousMetadataSubtaskBase import \ + MiscellaneousMetadataSubtaskBase +from core.classes.subtasks.MiscellaneousMetadata.MuckrockMiscMetadataSubtask import MuckrockMiscMetadataSubtask + + +class URLMiscellaneousMetadataTaskOperator(TaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient + ): + super().__init__(adb_client) + + @property + def task_type(self): + return TaskType.MISC_METADATA + + async def meets_task_prerequisites(self): + return await self.adb_client.has_pending_urls_missing_miscellaneous_metadata() + + async def get_subtask(self, collector_type: CollectorType) -> MiscellaneousMetadataSubtaskBase: + match collector_type: + case CollectorType.MUCKROCK_SIMPLE_SEARCH: + return MuckrockMiscMetadataSubtask() + case CollectorType.MUCKROCK_COUNTY_SEARCH: + return MuckrockMiscMetadataSubtask() + case CollectorType.MUCKROCK_ALL_SEARCH: + return MuckrockMiscMetadataSubtask() + case CollectorType.AUTO_GOOGLER: + return AutoGooglerMiscMetadataSubtask() + case CollectorType.CKAN: + return CKANMiscMetadataSubtask() + case _: + raise Exception(f"Unknown collector type: {collector_type}") + + async def inner_task_logic(self): + tdos: 
list[URLMiscellaneousMetadataTDO] = await self.adb_client.get_pending_urls_missing_miscellaneous_metadata() + await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) + + error_infos = [] + for tdo in tdos: + subtask = await self.get_subtask(tdo.collector_type) + try: + subtask.process(tdo) + except Exception as e: + error_info = URLErrorPydanticInfo( + task_id=self.task_id, + url_id=tdo.url_id, + error=str(e), + ) + error_infos.append(error_info) + + await self.adb_client.add_miscellaneous_metadata(tdos) + await self.adb_client.add_url_error_infos(error_infos) \ No newline at end of file diff --git a/core/classes/subtasks/MiscellaneousMetadata/AutoGooglerMiscMetadataSubtask.py b/core/classes/subtasks/MiscellaneousMetadata/AutoGooglerMiscMetadataSubtask.py new file mode 100644 index 00000000..43659a9e --- /dev/null +++ b/core/classes/subtasks/MiscellaneousMetadata/AutoGooglerMiscMetadataSubtask.py @@ -0,0 +1,10 @@ +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO +from core.classes.subtasks.MiscellaneousMetadata.MiscellaneousMetadataSubtaskBase import \ + MiscellaneousMetadataSubtaskBase + + +class AutoGooglerMiscMetadataSubtask(MiscellaneousMetadataSubtaskBase): + + def process(self, tdo: URLMiscellaneousMetadataTDO): + tdo.name = tdo.collector_metadata['title'] + tdo.description = tdo.collector_metadata['snippet'] \ No newline at end of file diff --git a/core/classes/subtasks/MiscellaneousMetadata/CKANMiscMetadataSubtask.py b/core/classes/subtasks/MiscellaneousMetadata/CKANMiscMetadataSubtask.py new file mode 100644 index 00000000..04ef7a0f --- /dev/null +++ b/core/classes/subtasks/MiscellaneousMetadata/CKANMiscMetadataSubtask.py @@ -0,0 +1,13 @@ +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO +from core.classes.subtasks.MiscellaneousMetadata.MiscellaneousMetadataSubtaskBase import \ + MiscellaneousMetadataSubtaskBase + + +class CKANMiscMetadataSubtask(MiscellaneousMetadataSubtaskBase): + + def process(self, tdo: URLMiscellaneousMetadataTDO): + tdo.name = tdo.collector_metadata['submitted_name'] + tdo.description = tdo.collector_metadata['description'] + tdo.record_formats = tdo.collector_metadata['record_format'] + tdo.data_portal_type = tdo.collector_metadata['data_portal_type'] + tdo.supplying_entity = tdo.collector_metadata['supplying_entity'] diff --git a/core/classes/subtasks/MiscellaneousMetadata/MiscellaneousMetadataSubtaskBase.py b/core/classes/subtasks/MiscellaneousMetadata/MiscellaneousMetadataSubtaskBase.py new file mode 100644 index 00000000..7a0e7d1f --- /dev/null +++ b/core/classes/subtasks/MiscellaneousMetadata/MiscellaneousMetadataSubtaskBase.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod + +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO + + +class MiscellaneousMetadataSubtaskBase(ABC): + + @abstractmethod + def process(self, tdo: URLMiscellaneousMetadataTDO): + raise NotImplementedError \ No newline at end of file diff --git a/core/classes/subtasks/MiscellaneousMetadata/MuckrockMiscMetadataSubtask.py b/core/classes/subtasks/MiscellaneousMetadata/MuckrockMiscMetadataSubtask.py new file mode 100644 index 00000000..1d599162 --- /dev/null +++ b/core/classes/subtasks/MiscellaneousMetadata/MuckrockMiscMetadataSubtask.py @@ -0,0 +1,10 @@ +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO +from core.classes.subtasks.MiscellaneousMetadata.MiscellaneousMetadataSubtaskBase 
import \ + MiscellaneousMetadataSubtaskBase + + +class MuckrockMiscMetadataSubtask(MiscellaneousMetadataSubtaskBase): + + def process(self, tdo: URLMiscellaneousMetadataTDO): + tdo.name = tdo.collector_metadata['title'] + tdo.description = tdo.collector_metadata['title'] diff --git a/label_studio_interface/DTOs/__init__.py b/core/classes/subtasks/MiscellaneousMetadata/__init__.py similarity index 100% rename from label_studio_interface/DTOs/__init__.py rename to core/classes/subtasks/MiscellaneousMetadata/__init__.py diff --git a/html_tag_collector/URLRequestInterface.py b/html_tag_collector/URLRequestInterface.py index 2b135516..20ea1989 100644 --- a/html_tag_collector/URLRequestInterface.py +++ b/html_tag_collector/URLRequestInterface.py @@ -79,10 +79,8 @@ async def make_requests( self, urls: list[str], ) -> list[URLResponseInfo]: - try: - ensure_browsers_installed() - return await self.fetch_urls(urls) - except Exception as e: - return [] + ensure_browsers_installed() + return await self.fetch_urls(urls) + diff --git a/label_studio_interface/DTOs/LabelStudioTaskExportInfo.py b/label_studio_interface/DTOs/LabelStudioTaskExportInfo.py deleted file mode 100644 index 07c0562b..00000000 --- a/label_studio_interface/DTOs/LabelStudioTaskExportInfo.py +++ /dev/null @@ -1,39 +0,0 @@ -from pydantic import BaseModel - -from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType - - -class LabelStudioTaskExportInfo(BaseModel): - url: str - html_title: str = "" - meta_description: str = "" - h1: list[str] = [] - h2: list[str] = [] - h3: list[str] = [] - h4: list[str] = [] - h5: list[str] = [] - h6: list[str] = [] - div_text: str = "" - url_path: str = "" - http_response: int = -1 - url_source_info: str = "" - -ENUM_TO_ATTRIBUTE_MAPPING = { - HTMLContentType.TITLE: "html_title", - HTMLContentType.DESCRIPTION: "meta_description", - HTMLContentType.H1: "h1", - HTMLContentType.H2: "h2", - HTMLContentType.H3: "h3", - HTMLContentType.H4: "h4", - HTMLContentType.H5: "h5", - HTMLContentType.H6: "h6", - HTMLContentType.DIV: "div_text" -} - -def add_html_info_to_export_info( - export_info: LabelStudioTaskExportInfo, - html_content_info: URLHTMLContentInfo -): - attribute_name = ENUM_TO_ATTRIBUTE_MAPPING[html_content_info.content_type] - setattr(export_info, attribute_name, html_content_info.content) - diff --git a/label_studio_interface/LabelStudioAPIManager.py b/label_studio_interface/LabelStudioAPIManager.py deleted file mode 100644 index 138dd0cd..00000000 --- a/label_studio_interface/LabelStudioAPIManager.py +++ /dev/null @@ -1,325 +0,0 @@ -import copy -import json -import os -import random -import string -import sys -from enum import Enum -from typing import Annotated - -import requests - -from label_studio_interface.DTOs.LabelStudioTaskExportInfo import LabelStudioTaskExportInfo - -# The below code sets the working directory to be the root of the entire repository -# This is done to solve otherwise quite annoying import issues. -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -from label_studio_interface.LabelStudioConfig import LabelStudioConfig - -""" -This script contains code which interfaces with the Label Studio API. -To view the documentation for the Label Studio API, visit https://app.heartex.com/docs/api -""" - - -class Role(Enum): - """ - This class represents the roles that a user can have in an organization. 
- """ - OWNER = "OW" - ADMINISTRATOR = "AD" - MANAGER = "MA" - REVIEWER = "RE" - ANNOTATOR = "AN" - DEACTIVATED = "DI" - NONE = "NO" - -def generate_random_word(length): - letters = string.ascii_lowercase - return ''.join(random.choice(letters) for _ in range(length)) - - -class URLConstructor: - def __init__(self, scheme="http", domain=None): - self.scheme = scheme - self.domain = domain - self.path_segments = [] - self.query_params = {} - - def add_path_segment(self, segment): - self.path_segments.append(segment) - return self - - def add_query_param(self, key, value): - self.query_params[key] = value - return self - - def build(self): - path = "/".join(self.path_segments) - query_string = "&".join([f"{key}={value}" for key, value in self.query_params.items()]) - url = f"{self.scheme}://{self.domain}" - if path: - url += f"/{path}" - if query_string: - url += f"?{query_string}" - return url - - -class LabelStudioAPIURLConstructor: - """ - This class is responsible for constructing the URL for the Label Studio API. - """ - - def __init__(self, project_id: str = '58475', organization_id: str = '1'): - self.base_url_constructor = URLConstructor( - domain='app.heartex.com', - scheme='https' - ).add_path_segment('api') - self.project_id = project_id - self.organization_id = organization_id - # self.label_studio_api_root_url = 'https://app.heartex.com/api' - # self.label_studio_api_root_url = f'https://app.heartex.com/api/projects/{project_id}' - - def get_import_url(self) -> str: - """ - This method returns the URL for importing data into Label Studio. - Returns: str - """ - new_constructor = copy.deepcopy(self.base_url_constructor) - return (new_constructor - .add_path_segment('projects') - .add_path_segment(self.project_id) - .add_path_segment('import') - .add_query_param('return_task_ids', 'true') - .build() - ) - - def get_project_url(self) -> str: - """ - This method returns the URL for the project. - Returns: str - """ - new_constructor = copy.deepcopy(self.base_url_constructor) - return (new_constructor - .add_path_segment('projects') - .add_path_segment(self.project_id) - .build() - ) - - def delete_project_tasks_url(self) -> str: - """ - This method returns the URL for deleting all tasks in the project. - Returns: str - """ - new_constructor = copy.deepcopy(self.base_url_constructor) - return (new_constructor - .add_path_segment('projects') - .add_path_segment(self.project_id) - .add_path_segment('ground-truth-tasks') - .build() - ) - - def get_easy_export_url(self, all_tasks: bool) -> str: - """ - This method returns the URL for the easy export. - Returns: str - """ - new_constructor = copy.deepcopy(self.base_url_constructor) - return (new_constructor - .add_path_segment('projects') - .add_path_segment(self.project_id) - .add_path_segment('export') - .add_query_param('exportType', 'JSON') - .add_query_param('download_all_tasks', str(all_tasks).lower()) - .build() - ) - - def get_organization_membership_url(self) -> str: - """ - This method returns the URL for organization membership - Used for querying the members in the organization as well as updating the role of a member. 
- Returns: str - """ - new_constructor = copy.deepcopy(self.base_url_constructor) - return (new_constructor - .add_path_segment('organizations') - .add_path_segment(self.organization_id) - .add_path_segment('memberships') - .build() - ) - - -class LabelStudioAPIManager: - - def __init__( - self, - config: LabelStudioConfig = LabelStudioConfig(), - ): - """ - This class is responsible for managing the API requests to Label Studio. - Args: - config: The user's authorization token for the Label Studio API. - """ - self.config = config - self.api_url_constructor = LabelStudioAPIURLConstructor( - project_id=self.config.project_id, - organization_id=self.config.organization_id - ) - - # region Task Import/Export - def export_tasks_into_project( - self, - data: list[LabelStudioTaskExportInfo] - ) -> Annotated[list[int], "The task IDs"]: - """ - This method imports task input data into Label Studio. - https://labelstud.io/api#tag/Import/operation/api_projects_import_create - Args: - data: dict - The data to import into Label Studio. - This should be a list of dictionaries, each containing - the same keys, representing data for the task - Returns: requests.Response - """ - dict_data = [] - for task in data: - dict_data.append(task.model_dump()) - import_url = self.api_url_constructor.get_import_url() - response = requests.post( - url=import_url, - data=json.dumps(dict_data), - # TODO: Consider extracting header construction - headers={ - 'Content-Type': 'application/json', - 'Authorization': self.config.authorization_token - } - ) - response.raise_for_status() - return response.json()["task_ids"] - - def import_tasks_from_project(self, all_tasks: bool = False) -> requests.Response: - """ - This method exports the data from the project. - Args: - all_tasks: bool - Whether to export all tasks or just the annotated tasks. - output_filename: str - The filename to save the exported data to. - Returns: requests.Response - """ - export_url = self.api_url_constructor.get_easy_export_url(all_tasks=all_tasks) - response = requests.get( - url=export_url, - headers={ - 'Authorization': self.config.authorization_token - } - ) - response.raise_for_status() - return response - - # endregion - - # region Project Information - def get_project_info(self) -> requests.Response: - """ - This method retrieves information about the project. - Returns: requests.Response - """ - project_url = self.api_url_constructor.get_project_url() - response = requests.get( - url=project_url, - headers={ - 'Authorization': self.config.authorization_token - } - ) - return response - - def ping_project(self) -> bool: - """ - This method pings the project, returning True if the project is accessible. - Returns: bool - """ - project_url = self.api_url_constructor.get_project_url() - response = requests.get( - url=project_url, - headers={ - 'Authorization': self.config.authorization_token - } - ) - return response.status_code == 200 - - # endregion - - # region User Management - def get_members_in_organization(self) -> requests.Response: - """ - This method retrieves the members in the organization. 
- https://app.heartex.com/docs/api#tag/Organizations/operation/api_organizations_memberships_list - Returns: requests.Response - """ - membership_url = self.api_url_constructor.get_organization_membership_url() - response = requests.get( - url=membership_url, - headers={ - 'Authorization': self.config.authorization_token - } - ) - response.raise_for_status() - return response - - def update_member_role(self, user_id: int, role: Role) -> requests.Response: - """ - This method updates the role of a member in the organization. - Args: - user_id: str - The ID of the user to update the role for. - role: Role - The role to update the user to. - Returns: requests.Response - """ - membership_url = self.api_url_constructor.get_organization_membership_url() - response = requests.patch( - url=membership_url, - headers={ - 'Authorization': self.config.authorization_token, - 'Content-Type': 'application/json' - }, - json={ - "user_id": user_id, - "role": role.value - } - ) - return response - - def delete_project_tasks(self) -> requests.Response: - """ - This method deletes all tasks from the project. - Returns: requests.Response - """ - delete_url = self.api_url_constructor.delete_project_tasks_url() - response = requests.delete( - url=delete_url, - headers={ - 'Authorization': self.config.authorization_token - } - ) - return response - - # endregion - - -if __name__ == "__main__": - - # Example usage - api_manager = LabelStudioAPIManager(config=LabelStudioConfig()) - project_accessible = api_manager.ping_project() - if project_accessible: - print("Project is accessible") - - # Test export - # data = [{"url": f"https://example.com/{generate_random_word(10)}"} for _ in range(10)] - # - # response = api_manager.import_data(data) - # print(response.status_code) - # print(response.json()) - - # Test import - response = api_manager.import_tasks_from_project() - print(response.status_code) - print(response.json()) diff --git a/label_studio_interface/LabelStudioConfig.py b/label_studio_interface/LabelStudioConfig.py deleted file mode 100644 index 14e5cef1..00000000 --- a/label_studio_interface/LabelStudioConfig.py +++ /dev/null @@ -1,32 +0,0 @@ -import os - -from dotenv import load_dotenv - - -class LabelStudioConfig: - """ - This class is responsible for loading the configuration for the Label Studio API. 
- """ - def __init__(self, dotenv_file=".env"): - """ - - Args: - dotenv_file: the path to the .env file which contains the configuration for the Label Studio API - """ - load_dotenv(dotenv_file) - # Note that if the environment variables are not set, the default values, given below, are used - self._project_id = os.getenv('LABEL_STUDIO_PROJECT_ID', '58475') - self._organization_id = os.getenv('LABEL_STUDIO_ORGANIZATION_ID', '1') - self._authorization_token = f'Token {os.getenv("LABEL_STUDIO_ACCESS_TOKEN", "abc123")}' - - @property - def project_id(self): - return self._project_id - - @property - def authorization_token(self): - return self._authorization_token - - @property - def organization_id(self): - return self._organization_id diff --git a/label_studio_interface/PreAnnotationCreator.py b/label_studio_interface/PreAnnotationCreator.py deleted file mode 100644 index 9630d464..00000000 --- a/label_studio_interface/PreAnnotationCreator.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -This class combines data with pre-annotation data, converting it into the requisite format for Label Studio -""" -from typing import Any - -class BaseResultInfo: - """ - Contains information required for every result - """ - def __init__(self, result_type: str, to_name: str, from_name: str, origin: str = "manual"): - """ - - Args: - result_type: One of the permitted Label Studio result types - to_name: Name of the entity being labeled - from_name: Source name of the result in the label configuration - origin: Where the result came from, defaults to "manual" - """ - self.result_type = result_type - self.to_name = to_name - self.from_name = from_name - self.origin = origin - -class TaxonomyResult: - - def __init__(self, base_info: BaseResultInfo, taxonomy_data: list[list[str]]): - self.base_info = base_info - self.taxonomy_data = taxonomy_data - - def to_dict(self) -> dict: - """ - Converts the taxonomy data to a dictionary - Returns: - - """ - return { - "type": self.base_info.result_type, - "value": { - "taxonomy": self.taxonomy_data - }, - "origin": self.base_info.origin, - "to_name": self.base_info.to_name, - "from_name": self.base_info.from_name - } - - - - -class PreAnnotationCreator: - - def __init__(self): - pass - - def add_taxonomy_data(self, raw_taxonomy_data: Any) -> list[list[str]]: - """ - This method adds taxonomy data to the pre-annotation data - - Taxonomy data exists as a list of lists - Each sub-list represents a single selection in the taxonomy - and is a list of strings representing each level of the taxonomy - with the first being the most superordinate, and the last being the most subordinate - Selections do not have to include all levels of the taxonomy - For example, in a taxonomy of animals, if "Dog" is selected, the selection is represented as ["Dog"] - However, a selection of a subordinate category entails selection of all relevant superordinate categories - For example, If "German Shepherd" is selected, the selection is represented as ["Dog", "German Shepherd"] - If "Dog" is also selected, that is included as a separate sub-list containing only ["Dog"] - - Example format: - [ - ["Dog", "German Shepherd"], - ["Dog"] - ] - - Args: - raw_taxonomy_data: Any: Taxonomy data to be converted into the requisite format for Label Studio - Returns: - list[list[str]]: The pre-annotation data with the taxonomy data added - - """ - - taxonomy_results = [] - - - - - # Note that for multi-hierarchical taxonomy data, - # any selection of the subordinate category - # will automatically entail selection of 
the superordinate category diff --git a/label_studio_interface/README.md b/label_studio_interface/README.md deleted file mode 100644 index 491ab4d8..00000000 --- a/label_studio_interface/README.md +++ /dev/null @@ -1,28 +0,0 @@ -This directory handles interfacing with -[Label Studio](https://labelstud.io/), a data labeling tool. It handles: -- Converting data from the format used by the rest of the pipeline to the format - used by Label Studio -- Uploading data to Label Studio -- Downloading labeled data from Label Studio -- Updating member roles in Label Studio - -# Environment Variables -For proper functioning of application, the following environment variables must be set in an `.env` file in the root directory: - -- LABEL_STUDIO_ACCESS_TOKEN: The access token for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. -- LABEL_STUDIO_PROJECT_ID: The project ID for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL. -- LABEL_STUDIO_ORGANIZATION_ID: The organization ID for the Label Studio API. This can - be obtained by logging into Label Studio and navigating to the [Organization section](https://app.heartex.com/organization?page=1), where the organization ID can be copied. - -# To run basic demonstration -1. Set the environment variables as described above; in dev.env, all but LABEL_STUDIO_ACCESS_TOKEN are pre-set. -2. Install the required python libraries by running the following command (from the working directory): -```bash -pip install -r requirements.txt -``` -2. Run the following command (from the label_studio_interface_directory): -```bash -python basic_demonstration.py -``` \ No newline at end of file diff --git a/label_studio_interface/__init__.py b/label_studio_interface/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/label_studio_interface/basic_demonstration.py b/label_studio_interface/basic_demonstration.py deleted file mode 100644 index 17e3d327..00000000 --- a/label_studio_interface/basic_demonstration.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -This script will serve as a basic demonstration of the functionality of -Label Studio and the Python configuration developed. - -The script will: - 1. Load the configuration for the Label Studio API - 2. Delete all task data from the associated project in Label Studio (if any exists) - 3. Import new task data into the project - 4. Prompt the user to access Label Studio and perform review and annotation tasks - 5. 
Export the annotated data from Label Studio and present it to the user - -The configuration for the Label Studio API will be loaded from the dev.env file within this directory -However, the access token in the file is not valid and will need to be replaced with a valid access token - -All actions will be performed on the 'Simple URL Labeler" project viewable at https://app.heartex.com/projects/58903/ -""" - -from LabelStudioAPIManager import LabelStudioAPIManager -from LabelStudioConfig import LabelStudioConfig - -# Simple URL Labeler project URL -project_url = "https://app.heartex.com/projects/58903/" - -# Load the configuration for the Label Studio API -config = LabelStudioConfig("dev.env") -if "REPLACE_WITH_YOUR_TOKEN" in config.authorization_token: - raise ValueError("Please replace the access token in dev.env with your own access token") - -# Create an API manager -api_manager = LabelStudioAPIManager(config) - -print("Deleting project tasks...") -# Delete all task data from the associated project in Label Studio (if any exists) -api_manager.delete_project_tasks() - -# Prompt the user to access Label Studio and confirm that the project has been cleared -print(f"Please access the project at {project_url} to confirm that the project has been cleared") - -# Wait for the user to confirm that the project has been cleared -input("Press Enter once confirmed...") -print("Continuing...") - -# Import new task data into the project -# Two tasks will be imported: one which has not been annotated and one which has been pre-annotated -# These tasks are provided in their final data form, -# but will need to be converted into this form in the eventual pipeline -data = [ - { - "data": { - "url": "https://test_data.gov/test/test-services/annual-test/" - } - }, - { - "data": { - "url": "www.example.com" - }, - "annotations": [ - { - "result": [ - { - "type": "taxonomy", - "value": { - "taxonomy": [ - [ - "Police Public Interactions" - ], - [ - "Police Public Interactions", - "Accident Reports" - ], - [ - "Police Public Interactions", - "Arrest Records" - ], - [ - "Agency Published Resources" - ], - [ - "Agency Published Resources", - "Crime Maps and Reports" - ], - [ - "Non-Criminal Justice" - ] - ] - }, - "origin": "manual", - "to_name": "url_text", - "from_name": "category" - }, - { - "type": "choices", - "value": { - "choices": [ - "Y" - ] - }, - "origin": "manual", - "to_name": "is_single_record", - "from_name": "single_record_checkbox" - } - ] - } - ] - } -] -api_manager.export_tasks_into_project(data) - -# Prompt the user to access Label Studio and perform review and annotation tasks -print(f"Please access the project at {project_url} to perform review and annotation tasks") - -# Wait for the user to complete the tasks -input("Press Enter when complete...") -print("Continuing...") - -# Import the annotated data from Label Studio and present it to the user -response = api_manager.import_tasks_from_project(all_tasks=True) -print("Presenting annotated data (showing only first results)...") -results = response.json() -for result in results: - print(f"Task URL: {result['data']['url']}") - if len(result['annotations']) == 0: - print("No annotations") - else: - print(f"Annotations: {result['annotations'][0]['result']}") - print("\n") diff --git a/label_studio_interface/dev.env b/label_studio_interface/dev.env deleted file mode 100644 index 5b603e4d..00000000 --- a/label_studio_interface/dev.env +++ /dev/null @@ -1,3 +0,0 @@ -LABEL_STUDIO_ACCESS_TOKEN=REPLACE_WITH_YOUR_TOKEN -LABEL_STUDIO_PROJECT_ID=58903 
-LABEL_STUDIO_ORGANIZATION_ID=9876
\ No newline at end of file
diff --git a/local_database/DataDumper/dump.sh b/local_database/DataDumper/dump.sh
index fb514157..9c07c0ca 100644
--- a/local_database/DataDumper/dump.sh
+++ b/local_database/DataDumper/dump.sh
@@ -6,7 +6,7 @@ DB_USER=${DUMP_USER:-"your_user"}
 DB_PORT=${DUMP_PORT:-"5432"} # Default to 5432 if not provided
 DB_PASSWORD=${DUMP_PASSWORD:-"your_password"}
 DB_NAME=${DUMP_NAME:-"your_database"}
-DUMP_FILE=${DUMP_FILE:-"/dump/db_dump.sql"}
+DUMP_FILE="/dump/db_dump.sql"
 # Export password for pg_dump
 export PGPASSWORD=$DB_PASSWORD
 # Dump the database
diff --git a/local_database/DataDumper/restore.sh b/local_database/DataDumper/restore.sh
index ff62349e..1efbe242 100644
--- a/local_database/DataDumper/restore.sh
+++ b/local_database/DataDumper/restore.sh
@@ -15,10 +15,14 @@ MAINT_CONNECTION_STRING="postgresql://$DB_USER:$DB_PASSWORD@$DB_HOST:$DB_PORT/$M
 echo "Checking if database $NEW_DB_NAME exists on $DB_HOST:$DB_PORT..."
 psql -d $MAINT_CONNECTION_STRING -tc "SELECT 1 FROM pg_database WHERE datname = '$NEW_DB_NAME';" | grep -q 1 && {
     echo "Database $NEW_DB_NAME exists. Dropping it..."
-    # Terminate all connections to the database
-    psql -d $MAINT_CONNECTION_STRING -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$NEW_DB_NAME';"
+    psql -d $MAINT_CONNECTION_STRING -tAc "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$NEW_DB_NAME' AND pid <> pg_backend_pid();"
     # Drop the database
     psql -d $MAINT_CONNECTION_STRING -c "DROP DATABASE $NEW_DB_NAME;"
+    echo "Waiting for connections to terminate..."
+    while psql -d $MAINT_CONNECTION_STRING -tAc "SELECT 1 FROM pg_stat_activity WHERE datname = '$NEW_DB_NAME';" | grep -q 1; do
+        sleep 1
+        echo "Still waiting..."
+    done
 }
 # Create the new database
 echo "Creating new database $NEW_DB_NAME on $DB_HOST:$DB_PORT..."
diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py
new file mode 100644
index 00000000..f88d2e9c
--- /dev/null
+++ b/start_mirrored_local_app.py
@@ -0,0 +1,402 @@
+"""
+Starts a local instance of the application utilizing a database
+mirrored from production.
+ +Because this is used for testing only, the docker module is not included in +requirements.txt, and must be installed separately via +`pip install docker` +""" +import datetime +import os +import platform +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional, Annotated +import uvicorn + +import docker +from docker.errors import APIError, NotFound +from docker.models.containers import Container +from pydantic import BaseModel, model_validator, AfterValidator + +from apply_migrations import apply_migrations +from util.helper_functions import get_from_env + +def is_absolute_path(path: str) -> str: + if len(path) == 0: + raise ValueError("Path is required") + if path[0] != "/": + raise ValueError("Container path must be absolute") + return path + +class VolumeInfo(BaseModel): + host_path: str + container_path: Annotated[str, AfterValidator(is_absolute_path)] + + def build_volumes(self): + return { + get_absolute_path(self.host_path): { + "bind": self.container_path, + "mode": "rw" + } + } + +def wait_for_pg_to_be_ready(container: Container): + for i in range(30): + exit_code, output = container.exec_run("pg_isready") + print(output) + if exit_code == 0: + return + time.sleep(1) + raise Exception("Timed out waiting for postgres to be ready") + +def get_absolute_path(relative_path: str) -> str: + """ + Get absolute path, using the current file as the point of reference + """ + current_dir = Path(__file__).parent + absolute_path = (current_dir / relative_path).resolve() + return str(absolute_path) + + +class DockerfileInfo(BaseModel): + image_tag: str + dockerfile_directory: Optional[str] = None + + + +class HealthCheckInfo(BaseModel): + test: list[str] + interval: int + timeout: int + retries: int + start_period: int + + def build_healthcheck(self) -> dict: + multiplicative_factor = 1000000000 # Assume 1 second + return { + "test": self.test, + "interval": self.interval * multiplicative_factor, + "timeout": self.timeout * multiplicative_factor, + "retries": self.retries, + "start_period": self.start_period * multiplicative_factor + } + +class DockerInfo(BaseModel): + dockerfile_info: DockerfileInfo + volume_info: Optional[VolumeInfo] = None + name: str + ports: Optional[dict] = None + environment: Optional[dict] + command: Optional[str] = None + entrypoint: Optional[list[str]] = None + health_check_info: Optional[HealthCheckInfo] = None + +def run_command_checked(command: list[str] or str, shell=False): + result = subprocess.run( + command, + check=True, + capture_output=True, + text=True, + shell=shell + ) + return result + +def is_docker_running(): + try: + client = docker.from_env() + client.ping() + return True + except docker.errors.DockerException as e: + print(f"Docker is not running: {e}") + return False + +def wait_for_health(container, timeout=30): + start = time.time() + while time.time() - start < timeout: + container.reload() # Refresh container state + state = container.attrs.get("State") + print(state) + health = container.attrs.get("State", {}).get("Health", {}) + status = health.get("Status") + print(f"Health status: {status}") + if status == "healthy": + print("Postgres is healthy.") + return + elif status == "unhealthy": + raise Exception("Postgres container became unhealthy.") + time.sleep(1) + raise TimeoutError("Timed out waiting for Postgres to become healthy.") + +def start_docker_engine(): + system = platform.system() + + match system: + case "Windows": + # Use PowerShell to start Docker Desktop on Windows + subprocess.run([ 
+ "powershell", "-Command", + "Start-Process 'Docker Desktop' -Verb RunAs" + ]) + case "Darwin": + # MacOS: Docker Desktop must be started manually or with open + subprocess.run(["open", "-a", "Docker"]) + case "Linux": + # Most Linux systems use systemctl to manage Docker + subprocess.run(["sudo", "systemctl", "start", "docker"]) + case _: + print(f"Unsupported OS: {system}") + sys.exit(1) + +class DockerManager: + def __init__(self): + self.client = docker.from_env() + self.network_name = "my_network" + self.network = self.start_network() + + def run_command(self, command: str, container_id: str): + exec_id = self.client.api.exec_create( + container_id, + cmd=command, + tty=True, + stdin=False + ) + output_stream = self.client.api.exec_start(exec_id=exec_id, stream=True) + for line in output_stream: + print(line.decode().rstrip()) + + def start_network(self): + try: + self.client.networks.create(self.network_name, driver="bridge") + except APIError as e: + # Assume already exists + print(e) + return self.client.networks.get("my_network") + + def stop_network(self): + self.client.networks.get("my_network").remove() + + def get_image(self, dockerfile_info: DockerfileInfo): + if dockerfile_info.dockerfile_directory: + # Build image from Dockerfile + self.client.images.build( + path=get_absolute_path(dockerfile_info.dockerfile_directory), + tag=dockerfile_info.image_tag + ) + else: + # Pull or use existing image + self.client.images.pull(dockerfile_info.image_tag) + + + def run_container( + self, + docker_info: DockerInfo, + ): + print(f"Running container {docker_info.name}") + try: + container = self.client.containers.get(docker_info.name) + if container.status == 'running': + print(f"Container '{docker_info.name}' is already running") + return container + print("Restarting container...") + container.start() + return container + except NotFound: + # Container does not exist; proceed to build/pull image and run + pass + + self.get_image(docker_info.dockerfile_info) + + container = self.client.containers.run( + image=docker_info.dockerfile_info.image_tag, + volumes=docker_info.volume_info.build_volumes() if docker_info.volume_info is not None else None, + command=docker_info.command, + entrypoint=docker_info.entrypoint, + detach=True, + name=docker_info.name, + ports=docker_info.ports, + network=self.network_name, + environment=docker_info.environment, + stdout=True, + stderr=True, + tty=True, + healthcheck=docker_info.health_check_info.build_healthcheck() if docker_info.health_check_info is not None else None + ) + return container + + def run_dockerfile(self, command: str): + dockerfile_path = os.path.dirname(os.path.abspath(__file__)) + tag = "court_scraper" + self.client.images.build( + path=dockerfile_path, + tag=tag + ) + + # Create data directory if doesn't exist + data_directory = dockerfile_path + "/data" + if not os.path.exists(data_directory): + os.makedirs(data_directory) + + volumes = { + data_directory: { + "bind": "/app/data", + "mode": "rw" + } + } + # Stop if + self.stop_dockerfile() + + container = self.run_container( + tag=tag, + command=command, + volumes=volumes, + name="court_scraper", + ports={"5000/tcp": 5000}, + environment={ + "MONGO_URI": "mongodb://mongo:27017/" + } + ) + + for log in container.logs(stream=True, follow=True): # 'follow=True' ensures logs stop when the container stops + print(log.decode().strip()) + + return container + + def stop_dockerfile(self): + try: + self.client.containers.get("court_scraper") + except NotFound: + return + 
self.client.containers.get("court_scraper").stop() + self.client.containers.get("court_scraper").remove() + +class TimestampChecker: + def __init__(self): + self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time() + + def load_last_run_time(self) -> Optional[datetime.datetime]: + # Check if file `last_run.txt` exists + # If it does, load the last run time + if os.path.exists("local_state/last_run.txt"): + with open("local_state/last_run.txt", "r") as f: + return datetime.datetime.strptime( + f.read(), + "%Y-%m-%d %H:%M:%S" + ) + return None + + def last_run_within_24_hours(self): + if self.last_run_time is None: + return False + return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1) + + def set_last_run_time(self): + # If directory `local_state` doesn't exist, create it + if not os.path.exists("local_state"): + os.makedirs("local_state") + + with open("local_state/last_run.txt", "w") as f: + f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + + +def main(): + docker_manager = DockerManager() + # Ensure docker is running, and start if not + if not is_docker_running(): + start_docker_engine() + + + # Ensure Dockerfile for database is running, and if not, start it + database_docker_info = DockerInfo( + dockerfile_info=DockerfileInfo( + image_tag="postgres:15", + ), + # volume_info=VolumeInfo( + # host_path="dbscripts", + # container_path="/var/lib/postgresql/data" + # ), + name="data_source_identification_db", + ports={ + "5432/tcp": 5432 + }, + environment={ + "POSTGRES_PASSWORD": "HanviliciousHamiltonHilltops", + "POSTGRES_USER": "test_source_collector_user", + "POSTGRES_DB": "source_collector_test_db" + }, + health_check_info=HealthCheckInfo( + test=["pg_isready", "-U", "test_source_collector_user", "-h", "127.0.0.1", "-p", "5432"], + interval=1, + timeout=3, + retries=30, + start_period=2 + ) + ) + container = docker_manager.run_container(database_docker_info) + wait_for_pg_to_be_ready(container) + + + # Start dockerfile for Datadumper + data_dumper_docker_info = DockerInfo( + dockerfile_info=DockerfileInfo( + image_tag="datadumper", + dockerfile_directory="local_database/DataDumper" + ), + volume_info=VolumeInfo( + host_path="./local_database/DataDumper/dump", + container_path="/dump" + ), + name="datadumper", + environment={ + "DUMP_HOST": get_from_env("DUMP_HOST"), + "DUMP_USER": get_from_env("DUMP_USER"), + "DUMP_PASSWORD": get_from_env("DUMP_PASSWORD"), + "DUMP_NAME": get_from_env("DUMP_DB_NAME"), + "DUMP_PORT": get_from_env("DUMP_PORT"), + "RESTORE_HOST": "data_source_identification_db", + "RESTORE_USER": "test_source_collector_user", + "RESTORE_PORT": "5432", + "RESTORE_DB_NAME": "source_collector_test_db", + "RESTORE_PASSWORD": "HanviliciousHamiltonHilltops", + }, + command="bash" + ) + + # If not last run within 24 hours, run dump operation in Datadumper + # Check cache if exists and + checker = TimestampChecker() + container = docker_manager.run_container(data_dumper_docker_info) + if checker.last_run_within_24_hours(): + print("Last run within 24 hours, skipping dump...") + else: + docker_manager.run_command( + '/usr/local/bin/dump.sh', + container.id + ) + docker_manager.run_command( + "/usr/local/bin/restore.sh", + container.id + ) + print("Stopping datadumper container") + container.stop() + checker.set_last_run_time() + + # Upgrade using alembic + apply_migrations() + + # Run `fastapi dev main.py` + uvicorn.run( + "api.main:app", + host="0.0.0.0", + port=8000 + ) + + + + + +if __name__ == "__main__": + main() \ No 
newline at end of file diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 5288496b..8968555c 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -175,6 +175,7 @@ def urls( self, batch_id: int, url_count: int, + collector_metadata: Optional[dict] = None, outcome: URLStatus = URLStatus.PENDING ) -> InsertURLsInfo: raw_urls = generate_test_urls(url_count) @@ -183,7 +184,8 @@ def urls( url_infos.append( URLInfo( url=url, - outcome=outcome + outcome=outcome, + collector_metadata=collector_metadata ) ) diff --git a/tests/test_automated/integration/tasks/test_url_html_task.py b/tests/test_automated/integration/tasks/test_url_html_task.py index 75c46855..3839d0a6 100644 --- a/tests/test_automated/integration/tasks/test_url_html_task.py +++ b/tests/test_automated/integration/tasks/test_url_html_task.py @@ -7,9 +7,7 @@ from collector_db.enums import TaskType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator -from core.enums import BatchStatus from tests.helpers.DBDataCreator import DBDataCreator -from tests.helpers.assert_functions import assert_database_has_no_tasks from html_tag_collector.DataClassTags import ResponseHTMLInfo from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.RootURLCache import RootURLCache diff --git a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py new file mode 100644 index 00000000..71093b2e --- /dev/null +++ b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py @@ -0,0 +1,143 @@ +from typing import Optional + +import pytest + +from collector_db.models import URL, URLOptionalDataSourceMetadata +from collector_manager.enums import CollectorType +from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome +from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator +from helpers.DBDataCreator import DBDataCreator + + +def batch_and_url( + db_data_creator: DBDataCreator, + collector_type: CollectorType, + collector_metadata: Optional[dict] +): + batch_id = db_data_creator.batch(strategy=collector_type) + url_id = db_data_creator.urls( + batch_id=batch_id, + url_count=1, + collector_metadata=collector_metadata + ).url_mappings[0].url_id + return url_id + + +@pytest.mark.asyncio +async def test_url_miscellaneous_metadata_task(db_data_creator: DBDataCreator): + + operator = URLMiscellaneousMetadataTaskOperator(adb_client=db_data_creator.adb_client) + + # Currently, task should not meet prerequisites + meets_prereqs = await operator.meets_task_prerequisites() + assert not meets_prereqs + + # Add one URL for each of the following batches, with appropriate collector metadata: + # ckan + ckan_url_id = batch_and_url( + db_data_creator, + CollectorType.CKAN, + collector_metadata={ + "submitted_name": "Test CKAN Name", + "description": "Test CKAN Description", + "record_format": ["CSV", "JSON"], + "data_portal_type": "Test Data Portal Type", + "supplying_entity": "Test Supplying Entity" + } + ) + # muckrock_simple + muckrock_simple_url_id = batch_and_url( + db_data_creator, + CollectorType.MUCKROCK_SIMPLE_SEARCH, + collector_metadata={ + 'title': 'Test Muckrock Simple Title', + } + ) + # muckrock_county + muckrock_county_url_id = batch_and_url( + db_data_creator, + CollectorType.MUCKROCK_COUNTY_SEARCH, + collector_metadata={ + 'title': 
'Test Muckrock County Title',
+        }
+    )
+    # muckrock_all
+    muckrock_all_url_id = batch_and_url(
+        db_data_creator,
+        CollectorType.MUCKROCK_ALL_SEARCH,
+        collector_metadata={
+            'title': 'Test Muckrock All Title',
+        }
+    )
+    # auto_googler
+    auto_googler_url_id = batch_and_url(
+        db_data_creator,
+        CollectorType.AUTO_GOOGLER,
+        collector_metadata={
+            "title" : "Test Auto Googler Title",
+            "snippet" : "Test Auto Googler Snippet"
+        }
+    )
+    # common_crawler
+    common_crawler_url_id = batch_and_url(
+        db_data_creator,
+        CollectorType.COMMON_CRAWLER,
+        collector_metadata=None
+    )
+    # example
+
+    # Check that task now meets prerequisites
+    meets_prereqs = await operator.meets_task_prerequisites()
+    assert meets_prereqs
+
+    # Run task
+    run_info = await operator.run_task(1)
+    assert run_info.outcome == TaskOperatorOutcome.SUCCESS
+
+    # Check that each URL has the expected name/description and optional metadata
+    expected_urls = {
+        common_crawler_url_id: (None, None),
+        auto_googler_url_id: ("Test Auto Googler Title", "Test Auto Googler Snippet"),
+        ckan_url_id: ("Test CKAN Name", "Test CKAN Description"),
+        muckrock_simple_url_id: ("Test Muckrock Simple Title", "Test Muckrock Simple Title"),
+        muckrock_county_url_id: ("Test Muckrock County Title", "Test Muckrock County Title"),
+        muckrock_all_url_id: ("Test Muckrock All Title", "Test Muckrock All Title"),
+    }
+
+    urls: list[URL] = await db_data_creator.adb_client.get_all(URL)
+    assert len(urls) == len(expected_urls)
+
+    seen_ids = set()
+
+    for url in urls:
+        assert url.id not in seen_ids, f"Duplicate url.id found: {url.id}"
+        seen_ids.add(url.id)
+
+        assert url.id in expected_urls, f"Unexpected url.id: {url.id}"
+        expected_name, expected_description = expected_urls[url.id]
+        assert url.name == expected_name, f"For url.id {url.id}, expected name {expected_name}, got {url.name}"
+        assert url.description == expected_description, f"For url.id {url.id}, expected description {expected_description}, got {url.description}"
+
+    expected_urls = {
+        common_crawler_url_id: (None, None, None),
+        auto_googler_url_id: (None, None, None),
+        ckan_url_id: (["CSV", "JSON"], "Test Data Portal Type", "Test Supplying Entity"),
+        muckrock_simple_url_id: (None, None, None),
+        muckrock_county_url_id: (None, None, None),
+        muckrock_all_url_id: (None, None, None),
+    }
+
+    metadatas: list[URLOptionalDataSourceMetadata] = await db_data_creator.adb_client.get_all(URLOptionalDataSourceMetadata)
+    seen_ids = set()
+    for metadata in metadatas:
+        assert metadata.url_id not in seen_ids, f"Duplicate url.id found: {metadata.url_id}"
+        seen_ids.add(metadata.url_id)
+
+        assert metadata.url_id in expected_urls, f"Unexpected url.id: {metadata.url_id}"
+        expected_record_format, expected_data_portal_type, expected_supplying_entity = expected_urls[metadata.url_id]
+        assert metadata.record_formats == expected_record_format, f"For url.id {metadata.url_id}, expected record_format {expected_record_format}, got {metadata.record_formats}"
+        assert metadata.data_portal_type == expected_data_portal_type, f"For url.id {metadata.url_id}, expected data_portal_type {expected_data_portal_type}, got {metadata.data_portal_type}"
+        assert metadata.supplying_entity == expected_supplying_entity, f"For url.id {metadata.url_id}, expected supplying_entity {expected_supplying_entity}, got {metadata.supplying_entity}"
+
+

From d8183a7f14b90045c0aa9937eefe85bc10227d98 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Tue, 25 Mar 2025 15:01:28 -0400
Subject: [PATCH 065/182] Correct bug in import addressing

---
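A note on the fix: the bare `helpers` import only resolves when pytest happens
to place `tests/` itself on `sys.path`, so collection could fail depending on
the working directory; the fully qualified `tests.helpers` path resolves from
the project root regardless. A minimal sketch of the difference (assuming the
suite is run from the repository root):

    from helpers.DBDataCreator import DBDataCreator        # resolves only if tests/ is on sys.path
    from tests.helpers.DBDataCreator import DBDataCreator  # resolves from the repository root
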
.../integration/tasks/test_url_miscellaneous_metadata_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py index 71093b2e..51f57da9 100644 --- a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py +++ b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py @@ -6,7 +6,7 @@ from collector_manager.enums import CollectorType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator -from helpers.DBDataCreator import DBDataCreator +from tests.helpers.DBDataCreator import DBDataCreator def batch_and_url( From 590d719e7413f1305743378de663e718002445e5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 26 Mar 2025 20:46:11 -0400 Subject: [PATCH 066/182] feat(app): Add additional information to Final Review Process --- collector_db/AsyncDatabaseClient.py | 100 ++++++++++++++++-- core/AsyncCore.py | 5 +- core/DTOs/FinalReviewApprovalInfo.py | 30 +++++- core/DTOs/GetNextURLForFinalReviewResponse.py | 20 +++- start_mirrored_local_app.py | 45 -------- tests/helpers/DBDataCreator.py | 27 +++++ tests/helpers/complex_test_data_functions.py | 10 +- .../integration/api/test_review.py | 26 ++++- .../collector_db/test_db_client.py | 75 ++++++++++++- 9 files changed, 267 insertions(+), 71 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 41685a4b..b8ba14a1 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,5 +1,5 @@ from functools import wraps -from typing import Optional, Type +from typing import Optional, Type, Any from fastapi import HTTPException from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update @@ -24,11 +24,13 @@ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata from collector_manager.enums import URLStatus, CollectorType +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse -from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo, \ + FinalReviewOptionalMetadata from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo @@ -1014,6 +1016,7 @@ def count_subquery(model: Type[Base]): URL.outcome == URLStatus.PENDING.value ) + # The below relationships are joined directly to the URL single_join_relationships = [ URL.agency, URL.html_content, @@ -1021,12 +1024,14 @@ def count_subquery(model: Type[Base]): URL.auto_relevant_suggestion, URL.user_relevant_suggestions, URL.user_record_type_suggestions, + 
URL.optional_data_source_metadata ] options = [ joinedload(relationship) for relationship in single_join_relationships ] + # The below relationships are joined to entities that are joined to the URL double_join_relationships = [ (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), (URL.user_agency_suggestions, UserUrlAgencySuggestion.agency) @@ -1059,11 +1064,23 @@ def count_subquery(model: Type[Base]): html_content = result.html_content html_content_infos = [URLHTMLContentInfo(**html_info.__dict__) for html_info in html_content] + if result.optional_data_source_metadata is None: + optional_metadata = FinalReviewOptionalMetadata() + else: + optional_metadata = FinalReviewOptionalMetadata( + record_formats=result.optional_data_source_metadata.record_formats, + data_portal_type=result.optional_data_source_metadata.data_portal_type, + supplying_entity=result.optional_data_source_metadata.supplying_entity + ) + + # Return return GetNextURLForFinalReviewResponse( id=result.id, url=result.url, html_info=convert_to_response_html_info(html_content_infos), + name=result.name, + description=result.description, annotations=FinalReviewAnnotationInfo( relevant=DTOConverter.final_review_annotation_relevant_info( user_suggestions=result.user_relevant_suggestions, @@ -1078,33 +1095,63 @@ def count_subquery(model: Type[Base]): user_agency_suggestions=result.user_agency_suggestions, confirmed_agency=result.agency ) - ) + ), + optional_metadata=optional_metadata ) @session_manager async def approve_url( self, session: AsyncSession, - url_id: int, - record_type: RecordType, - relevant: bool, + approval_info: FinalReviewApprovalInfo, user_id: int, - agency_id: Optional[int] = None, ) -> None: # Get URL + def update_if_not_none( + model, + field, + value: Optional[Any], + required: bool=False + ): + if value is not None: + setattr(model, field, value) + return + if not required: + return + model_value = getattr(model, field, None) + if model_value is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Must specify {field} if it does not already exist" + ) + query = ( Select(URL) - .where(URL.id == url_id) + .where(URL.id == approval_info.url_id) + .options( + joinedload(URL.optional_data_source_metadata), + ) ) url = await session.execute(query) url = url.scalars().first() - url.record_type = record_type.value - url.relevant = relevant + update_if_not_none( + url, + "record_type", + approval_info.record_type.value if approval_info.record_type is not None else None, + required=True + ) + update_if_not_none( + url, + "relevant", + approval_info.relevant, + required=True + ) + agency_id = approval_info.agency_id if url.agency_id is None and agency_id is None: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, @@ -1114,13 +1161,44 @@ async def approve_url( if url.agency_id != agency_id and agency_id is not None: url.agency_id = agency_id + + # If it does, do nothing url.outcome = URLStatus.VALIDATED.value + update_if_not_none(url, "name", approval_info.name, required=True) + update_if_not_none(url, "description", approval_info.description, required=True) + + optional_metadata = url.optional_data_source_metadata + if optional_metadata is None: + url.optional_data_source_metadata = URLOptionalDataSourceMetadata( + record_formats=approval_info.record_formats, + data_portal_type=approval_info.data_portal_type, + supplying_entity=approval_info.supplying_entity + ) + else: + update_if_not_none( + optional_metadata, + "record_formats", + 
approval_info.record_formats + ) + update_if_not_none( + optional_metadata, + "data_portal_type", + approval_info.data_portal_type + ) + update_if_not_none( + optional_metadata, + "supplying_entity", + approval_info.supplying_entity + ) + + # Add approving user + approving_user_url = ApprovingUserURL( user_id=user_id, - url_id=url_id + url_id=approval_info.url_id ) session.add(approving_user_url) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 5317023b..8b422d7d 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -226,10 +226,7 @@ async def approve_and_get_next_source_for_review( access_info: AccessInfo ): await self.adb_client.approve_url( - url_id=approval_info.url_id, - record_type=approval_info.record_type, - relevant=approval_info.relevant, - agency_id=approval_info.agency_id, + approval_info=approval_info, user_id=access_info.user_id ) return await self.get_next_source_for_review() diff --git a/core/DTOs/FinalReviewApprovalInfo.py b/core/DTOs/FinalReviewApprovalInfo.py index 210a07e3..f0cb3733 100644 --- a/core/DTOs/FinalReviewApprovalInfo.py +++ b/core/DTOs/FinalReviewApprovalInfo.py @@ -9,12 +9,12 @@ class FinalReviewApprovalInfo(BaseModel): url_id: int = Field( title="The id of the URL." ) - record_type: RecordType = Field( + record_type: Optional[RecordType] = Field( title="The final record type of the URL." "If none, defers to the existing value from the auto-labeler only if it exists.", default=None ) - relevant: bool = Field( + relevant: Optional[bool] = Field( title="Final determination on whether the URL is relevant." "If none, defers to the existing value from the auto-labeler only if it exists.", default=None @@ -24,3 +24,29 @@ class FinalReviewApprovalInfo(BaseModel): "If none, defers to an existing confirmed agency only if that exists.", default=None ) + name: Optional[str] = Field( + title="The name of the source. " + "If none, defers to an existing name only if that exists.", + default=None + ) + description: Optional[str] = Field( + title="The description of the source. " + "If none, defers to an existing description only if that exists.", + default=None + ) + record_formats: Optional[list[str]] = Field( + title="The record formats of the source. " + "If none, defers to an existing record formats only if that exists.", + default=None + ) + data_portal_type: Optional[str] = Field( + title="The data portal type of the source. " + "If none, defers to an existing data portal type only if that exists.", + default=None + ) + supplying_entity: Optional[str] = Field( + title="The supplying entity of the source. 
" + "If none, defers to an existing supplying entity only if that exists.", + default=None + ) + diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py index df28040b..70eb1301 100644 --- a/core/DTOs/GetNextURLForFinalReviewResponse.py +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -6,7 +6,6 @@ from core.enums import RecordType from html_tag_collector.DataClassTags import ResponseHTMLInfo -# Todo: Add descriptions class FinalReviewAnnotationRelevantUsersInfo(BaseModel): relevant: int = Field(title="Number of users who marked the URL as relevant") @@ -54,13 +53,32 @@ class FinalReviewAnnotationInfo(BaseModel): title="User and auto annotations for agency", ) +class FinalReviewOptionalMetadata(BaseModel): + record_formats: Optional[list[str]] = Field( + title="The record formats of the source", + default=None + ) + data_portal_type: Optional[str] = Field( + title="The data portal type of the source", + default=None + ) + supplying_entity: Optional[str] = Field( + title="The supplying entity of the source", + default=None + ) + class GetNextURLForFinalReviewResponse(BaseModel): id: int = Field(title="The id of the URL") url: str = Field(title="The URL") + name: Optional[str] = Field(title="The name of the source") + description: Optional[str] = Field(title="The description of the source") html_info: ResponseHTMLInfo = Field(title="The HTML content of the URL") annotations: FinalReviewAnnotationInfo = Field( title="The annotations for the URL, from both users and the auto-labeler", ) + optional_metadata: FinalReviewOptionalMetadata = Field( + title="Optional metadata for the source", + ) class GetNextURLForFinalReviewOuterResponse(BaseModel): next_source: Optional[GetNextURLForFinalReviewResponse] = Field( diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py index f88d2e9c..48859adc 100644 --- a/start_mirrored_local_app.py +++ b/start_mirrored_local_app.py @@ -226,51 +226,6 @@ def run_container( ) return container - def run_dockerfile(self, command: str): - dockerfile_path = os.path.dirname(os.path.abspath(__file__)) - tag = "court_scraper" - self.client.images.build( - path=dockerfile_path, - tag=tag - ) - - # Create data directory if doesn't exist - data_directory = dockerfile_path + "/data" - if not os.path.exists(data_directory): - os.makedirs(data_directory) - - volumes = { - data_directory: { - "bind": "/app/data", - "mode": "rw" - } - } - # Stop if - self.stop_dockerfile() - - container = self.run_container( - tag=tag, - command=command, - volumes=volumes, - name="court_scraper", - ports={"5000/tcp": 5000}, - environment={ - "MONGO_URI": "mongodb://mongo:27017/" - } - ) - - for log in container.logs(stream=True, follow=True): # 'follow=True' ensures logs stop when the container stops - print(log.decode().strip()) - - return container - - def stop_dockerfile(self): - try: - self.client.containers.get("court_scraper") - except NotFound: - return - self.client.containers.get("court_scraper").stop() - self.client.containers.get("court_scraper").remove() class TimestampChecker: def __init__(self): diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 8968555c..31afb7c4 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -15,6 +15,7 @@ from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType from collector_manager.enums import CollectorType, URLStatus from core.DTOs.URLAgencySuggestionInfo import 
URLAgencySuggestionInfo +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO from core.enums import BatchStatus, SuggestionType, RecordType from tests.helpers.simple_test_data_functions import generate_test_urls @@ -194,6 +195,32 @@ def urls( batch_id=batch_id, ) + async def url_miscellaneous_metadata( + self, + url_id: int, + name: str = "Test Name", + description: str = "Test Description", + record_formats: Optional[list[str]] = None, + data_portal_type: Optional[str] = "Test Data Portal Type", + supplying_entity: Optional[str] = "Test Supplying Entity" + ): + if record_formats is None: + record_formats = ["Test Record Format", "Test Record Format 2"] + + tdo = URLMiscellaneousMetadataTDO( + url_id=url_id, + collector_metadata={}, + collector_type=CollectorType.EXAMPLE, + record_formats=record_formats, + name=name, + description=description, + data_portal_type=data_portal_type, + supplying_entity=supplying_entity + ) + + await self.adb_client.add_miscellaneous_metadata([tdo]) + + def duplicate_urls(self, duplicate_batch_id: int, url_ids: list[int]): """ Create duplicates for all given url ids, and associate them diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 44415090..4aa9a86f 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -6,7 +6,8 @@ async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, annotation_count: int, - include_user_annotations: bool = True + include_user_annotations: bool = True, + include_miscellaneous_metadata: bool = True ): """ Sets up the database to test the final_review functions @@ -15,7 +16,12 @@ async def setup_for_get_next_url_for_final_review( """ batch_id = db_data_creator.batch() - url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] + url_mapping = db_data_creator.urls( + batch_id=batch_id, + url_count=1 + ).url_mappings[0] + if include_miscellaneous_metadata: + await db_data_creator.url_miscellaneous_metadata(url_id=url_mapping.url_id) await db_data_creator.html_data([url_mapping.url_id]) async def add_agency_suggestion(count: int): diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index 99af93e9..715c6926 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -1,6 +1,6 @@ import pytest -from collector_db.models import URL +from collector_db.models import URL, URLOptionalDataSourceMetadata from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse @@ -27,6 +27,15 @@ async def test_review_next_source(api_test_helper): result = outer_result.next_source + assert result.name == "Test Name" + assert result.description == "Test Description" + + optional_metadata = result.optional_metadata + + assert optional_metadata.data_portal_type == "Test Data Portal Type" + assert optional_metadata.supplying_entity == "Test Supplying Entity" + assert optional_metadata.record_formats == ["Test Record Format", "Test Record Format 2"] + assert result.url == url_mapping.url html_info = result.html_info assert html_info.description == "test description" @@ -80,7 +89,12 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): url_id=url_mapping.url_id, 
record_type=RecordType.ARREST_RECORDS, relevant=True, - agency_id=agency_id + agency_id=agency_id, + name="New Test Name", + description="New Test Description", + record_formats=["New Test Record Format", "New Test Record Format 2"], + data_portal_type="New Test Data Portal Type", + supplying_entity="New Test Supplying Entity" ) ) @@ -96,5 +110,13 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): assert url.record_type == RecordType.ARREST_RECORDS.value assert url.relevant == True assert url.outcome == URLStatus.VALIDATED.value + assert url.name == "New Test Name" + assert url.description == "New Test Description" + + optional_metadata = await adb_client.get_all(URLOptionalDataSourceMetadata) + assert len(optional_metadata) == 1 + assert optional_metadata[0].data_portal_type == "New Test Data Portal Type" + assert optional_metadata[0].supplying_entity == "New Test Supplying Entity" + assert optional_metadata[0].record_formats == ["New Test Record Format", "New Test Record Format 2"] diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index b8ac56f1..cd2527b6 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -1,6 +1,8 @@ from datetime import datetime, timedelta import pytest +from _pytest.outcomes import fail +from fastapi import HTTPException from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo @@ -9,8 +11,9 @@ from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import URL, ApprovingUserURL +from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata from collector_manager.enums import URLStatus +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.enums import BatchStatus, RecordType, SuggestionType from tests.helpers.DBDataCreator import DBDataCreator from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review @@ -329,9 +332,11 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): adb_client = db_data_creator.adb_client # Approve URL. Only URL should be affected. No other properties should be changed. 
await adb_client.approve_url(
-        url_mapping.url_id,
-        record_type=RecordType.ARREST_RECORDS,
-        relevant=True,
+        approval_info=FinalReviewApprovalInfo(
+            url_id=url_mapping.url_id,
+            record_type=RecordType.ARREST_RECORDS,
+            relevant=True,
+        ),
         user_id=1
     )
 
@@ -344,9 +349,71 @@
     assert url.record_type == RecordType.ARREST_RECORDS.value
     assert url.relevant == True
     assert url.outcome == URLStatus.VALIDATED.value
+    assert url.name == "Test Name"
+    assert url.description == "Test Description"
 
     approving_user_urls = await adb_client.get_all(ApprovingUserURL)
     assert len(approving_user_urls) == 1
     assert approving_user_urls[0].user_id == 1
     assert approving_user_urls[0].url_id == url_mapping.url_id
+
+    optional_metadata = await adb_client.get_all(URLOptionalDataSourceMetadata)
+    assert len(optional_metadata) == 1
+    assert optional_metadata[0].url_id == url_mapping.url_id
+    assert optional_metadata[0].record_formats == ["Test Record Format", "Test Record Format 2"]
+    assert optional_metadata[0].data_portal_type == "Test Data Portal Type"
+    assert optional_metadata[0].supplying_entity == "Test Supplying Entity"
+
+@pytest.mark.asyncio
+async def test_approval_url_error(db_data_creator: DBDataCreator):
+    url_mapping = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=3,
+        include_user_annotations=True,
+        include_miscellaneous_metadata=False
+    )
+
+    # Set all required descriptors to none and receive an error
+    adb_client = db_data_creator.adb_client
+    with pytest.raises(HTTPException) as e:
+        await adb_client.approve_url(
+            approval_info=FinalReviewApprovalInfo(
+                url_id=url_mapping.url_id,
+            ),
+            user_id=1
+        )
+    assert e.value.status_code == 400
+
+    # Create kwarg dictionary with all required approval info fields
+    kwarg_dict = {
+        "record_type": RecordType.ARREST_RECORDS,
+        "agency_id": await db_data_creator.agency(),
+        "name": "Test Name",
+        "description": "Test Description",
+    }
+    # For each keyword, create a copy of the kwargs and set that one to none
+    # Confirm it produces the correct error
+    for kwarg in kwarg_dict:
+        kwarg_copy = kwarg_dict.copy()
+        kwarg_copy[kwarg] = None
+        with pytest.raises(HTTPException) as e:
+            await adb_client.approve_url(
+                approval_info=FinalReviewApprovalInfo(
+                    url_id=url_mapping.url_id,
+                    relevant=True,
+                    **kwarg_copy
+                ),
+                user_id=1
+            )
+            pytest.fail(f"Expected error for kwarg {kwarg}")
+
+    # Test that if all kwargs are set, no error is raised
+    await adb_client.approve_url(
+        approval_info=FinalReviewApprovalInfo(
+            url_id=url_mapping.url_id,
+            relevant=True,
+            **kwarg_dict
+        ),
+        user_id=1
+    )
+

From 0552310fb88b406720d0bb24da14024dcc203c05 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Fri, 28 Mar 2025 09:09:24 -0400
Subject: [PATCH 067/182] DRAFT

---
 ...ef_remove_agency_id_parameter_from_urls.py | 56 ++++++++++++++++++
 collector_db/AsyncDatabaseClient.py           | 57 +++++++++++++------
 collector_db/DTOConverter.py                  | 43 ++++++++------
 collector_db/models.py                        | 22 ++++++-
 core/DTOs/FinalReviewApprovalInfo.py          |  4 +-
 core/DTOs/GetNextURLForFinalReviewResponse.py |  2 +-
 tests/helpers/DBDataCreator.py                |  2 +-
 .../integration/api/test_review.py            | 24 ++++++--
 .../collector_db/test_db_client.py            |  2 +-
 9 files changed, 164 insertions(+), 48 deletions(-)
 create mode 100644 alembic/versions/2025_03_28_0807-5ea47dacd0ef_remove_agency_id_parameter_from_urls.py

diff --git a/alembic/versions/2025_03_28_0807-5ea47dacd0ef_remove_agency_id_parameter_from_urls.py
b/alembic/versions/2025_03_28_0807-5ea47dacd0ef_remove_agency_id_parameter_from_urls.py new file mode 100644 index 00000000..bc3f9bd3 --- /dev/null +++ b/alembic/versions/2025_03_28_0807-5ea47dacd0ef_remove_agency_id_parameter_from_urls.py @@ -0,0 +1,56 @@ +"""Remove agency_id parameter from URLs + +Revision ID: 5ea47dacd0ef +Revises: 6eb8084e2f48 +Create Date: 2025-03-28 08:07:24.442764 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5ea47dacd0ef' +down_revision: Union[str, None] = '6eb8084e2f48' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Remove agency ID column from URLs + op.drop_column( + 'urls', + 'agency_id' + ) + + op.create_table( + 'confirmed_url_agency', + sa.Column('id', sa.Integer(), primary_key=True), + sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id', ondelete='CASCADE'), nullable=False), + sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey('agencies.agency_id', ondelete='CASCADE'), nullable=False), + sa.Column('created_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()')), + sa.Column('updated_at', sa.TIMESTAMP(), nullable=False, server_default=sa.text('now()'), onupdate=sa.text('now()')), + sa.UniqueConstraint( + 'url_id', 'agency_id', + name="uq_confirmed_url_agency" + ) + ) + + +def downgrade() -> None: + op.add_column( + 'urls', + sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey('agencies.agency_id', ondelete='NO ACTION'), + nullable=True + ) + ) + + op.drop_table('confirmed_url_agency') \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index b8ba14a1..2d6f2efe 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -2,7 +2,7 @@ from typing import Optional, Type, Any from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute from sqlalchemy.sql.functions import coalesce @@ -22,7 +22,7 @@ from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ - UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata + UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency from collector_manager.enums import URLStatus, CollectorType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo @@ -775,10 +775,9 @@ async def get_next_url_agency_for_annotation( # Select statement statement = ( select(URL.id, URL.url) - # Must not have a confirmed agency identifier. 
+ # Must not have confirmed agencies .where( and_( - URL.agency_id.is_(None), URL.outcome == URLStatus.PENDING.value ) ) @@ -803,6 +802,15 @@ async def get_next_url_agency_for_annotation( correlate(URL) ) ) + # Must not have confirmed agencies + .join(ConfirmedURLAgency, isouter=True) + .where( + ~exists( + select(ConfirmedURLAgency). + where(ConfirmedURLAgency.url_id == URL.id). + correlate(URL) + ) + ) ).limit(1) raw_result = await session.execute(statement) results = raw_result.all() @@ -885,9 +893,11 @@ async def add_confirmed_agency_url_links( suggestions: list[URLAgencySuggestionInfo] ): for suggestion in suggestions: - url = await session.execute(select(URL).where(URL.id == suggestion.url_id)) - url = url.scalar_one() - url.agency_id = suggestion.pdap_agency_id + confirmed_agency = ConfirmedURLAgency( + url_id=suggestion.url_id, + agency_id=suggestion.pdap_agency_id + ) + session.add(confirmed_agency) @session_manager async def add_agency_auto_suggestions( @@ -1018,13 +1028,12 @@ def count_subquery(model: Type[Base]): # The below relationships are joined directly to the URL single_join_relationships = [ - URL.agency, URL.html_content, URL.auto_record_type_suggestion, URL.auto_relevant_suggestion, URL.user_relevant_suggestions, URL.user_record_type_suggestions, - URL.optional_data_source_metadata + URL.optional_data_source_metadata, ] options = [ @@ -1034,7 +1043,8 @@ def count_subquery(model: Type[Base]): # The below relationships are joined to entities that are joined to the URL double_join_relationships = [ (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), - (URL.user_agency_suggestions, UserUrlAgencySuggestion.agency) + (URL.user_agency_suggestions, UserUrlAgencySuggestion.agency), + (URL.confirmed_agencies, ConfirmedURLAgency.agency) ] for primary, secondary in double_join_relationships: options.append(joinedload(primary).joinedload(secondary)) @@ -1093,7 +1103,7 @@ def count_subquery(model: Type[Base]): agency=DTOConverter.final_review_annotation_agency_info( automated_agency_suggestions=result.automated_agency_suggestions, user_agency_suggestions=result.user_agency_suggestions, - confirmed_agency=result.agency + confirmed_agencies=result.confirmed_agencies ) ), optional_metadata=optional_metadata @@ -1132,6 +1142,7 @@ def update_if_not_none( .where(URL.id == approval_info.url_id) .options( joinedload(URL.optional_data_source_metadata), + joinedload(URL.confirmed_agencies), ) ) @@ -1151,17 +1162,29 @@ def update_if_not_none( required=True ) - agency_id = approval_info.agency_id - if url.agency_id is None and agency_id is None: + # Get existing agency ids + existing_agency_ids = [agency.agency_id for agency in url.confirmed_agencies] + new_agency_ids = approval_info.agency_ids + if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail="Must specify agency_id if URL does not already have a confirmed agency" ) - # If a different agency exists as confirmed, overwrite it - if url.agency_id != agency_id and agency_id is not None: - url.agency_id = agency_id - + # Get any existing agency ids that are not in the new agency ids + for existing_agency in url.confirmed_agencies: + if existing_agency.id not in new_agency_ids: + # If the existing agency id is not in the new agency ids, delete it + await session.delete(existing_agency) + # Add any new agency ids that are not in the existing agency ids + for new_agency_id in new_agency_ids: + if new_agency_id not in existing_agency_ids: + # If the 
new agency id is not in the existing agency ids, add it + confirmed_url_agency = ConfirmedURLAgency( + url_id=approval_info.url_id, + agency_id=new_agency_id + ) + session.add(confirmed_url_agency) # If it does, do nothing diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py index 6bf9a967..0d2856cf 100644 --- a/collector_db/DTOConverter.py +++ b/collector_db/DTOConverter.py @@ -4,7 +4,8 @@ from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.enums import ValidationStatus, ValidationSource, URLMetadataAttributeType from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent, URL, Agency, \ - AutoRecordTypeSuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, AutoRelevantSuggestion + AutoRecordTypeSuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, AutoRelevantSuggestion, \ + ConfirmedURLAgency from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo from core.DTOs.GetNextURLForFinalReviewResponse import FinalReviewAnnotationRelevantInfo, \ FinalReviewAnnotationRelevantUsersInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ @@ -130,27 +131,35 @@ def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( # Return sorted return dict(sorted(d.items(), key=lambda x: x[1].count, reverse=True)) + @staticmethod + def confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies: list[ConfirmedURLAgency] + ) -> list[GetNextURLForAgencyAgencyInfo]: + results = [] + for confirmed_agency in confirmed_agencies: + agency = confirmed_agency.agency + agency_info = GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.CONFIRMED, + pdap_agency_id=agency.agency_id, + agency_name=agency.name, + state=agency.state, + county=agency.county, + locality=agency.locality + ) + results.append(agency_info) + return results + @staticmethod def final_review_annotation_agency_info( automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], - confirmed_agency: Optional[Agency], + confirmed_agencies: list[ConfirmedURLAgency], user_agency_suggestions: list[UserUrlAgencySuggestion] ): - if confirmed_agency is not None: - confirmed_agency_info = GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=confirmed_agency.agency_id, - agency_name=confirmed_agency.name, - state=confirmed_agency.state, - county=confirmed_agency.county, - locality=confirmed_agency.locality - ) - return FinalReviewAnnotationAgencyInfo( - confirmed=confirmed_agency_info, - users=None, - auto=None - ) + + confirmed_agency_info = DTOConverter.confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies + ) agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( automated_agency_suggestions @@ -161,7 +170,7 @@ def final_review_annotation_agency_info( ) return FinalReviewAnnotationAgencyInfo( - confirmed=None, + confirmed=confirmed_agency_info, users=agency_user_info, auto=agency_auto_info ) diff --git a/collector_db/models.py b/collector_db/models.py index 4a61be88..55b75af2 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -101,7 +101,6 @@ class URL(Base): ), nullable=False ) - agency_id = Column(Integer, ForeignKey('agencies.agency_id', name='fk_url_agency_id')) record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=True) relevant = Column(Boolean, nullable=True) created_at = get_created_at_column() @@ -117,7 +116,6 @@ 
class URL(Base): secondary="link_task_urls", back_populates="urls", ) - agency = relationship("Agency", uselist=False, back_populates="urls") automated_agency_suggestions = relationship( "AutomatedUrlAgencySuggestion", back_populates="url") user_agency_suggestions = relationship( @@ -134,6 +132,10 @@ class URL(Base): "ApprovingUserURL", back_populates="url") optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") + confirmed_agencies = relationship( + "ConfirmedURLAgency", + ) + class URLOptionalDataSourceMetadata(Base): __tablename__ = 'url_optional_data_source_metadata' @@ -328,9 +330,23 @@ class Agency(Base): updated_at = get_updated_at_column() # Relationships - urls = relationship("URL", back_populates="agency") automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") + confirmed_urls = relationship("ConfirmedURLAgency", back_populates="agency") + +class ConfirmedURLAgency(Base): + __tablename__ = "confirmed_url_agency" + + id = Column(Integer, primary_key=True, autoincrement=True) + url_id = Column(Integer, ForeignKey("urls.id"), nullable=False) + agency_id = Column(Integer, ForeignKey("agencies.agency_id"), nullable=False) + + url = relationship("URL", back_populates="confirmed_agencies") + agency = relationship("Agency", back_populates="confirmed_urls") + + __table_args__ = ( + UniqueConstraint("url_id", "agency_id", name="uq_confirmed_url_agency"), + ) class AutomatedUrlAgencySuggestion(Base): __tablename__ = "automated_url_agency_suggestions" diff --git a/core/DTOs/FinalReviewApprovalInfo.py b/core/DTOs/FinalReviewApprovalInfo.py index f0cb3733..e24c3c75 100644 --- a/core/DTOs/FinalReviewApprovalInfo.py +++ b/core/DTOs/FinalReviewApprovalInfo.py @@ -19,8 +19,8 @@ class FinalReviewApprovalInfo(BaseModel): "If none, defers to the existing value from the auto-labeler only if it exists.", default=None ) - agency_id: Optional[int] = Field( - title="The final confirmed agency for the URL. " + agency_ids: Optional[list[int]] = Field( + title="The final confirmed agencies for the URL. 
" "If none, defers to an existing confirmed agency only if that exists.", default=None ) diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py index 70eb1301..422c38ab 100644 --- a/core/DTOs/GetNextURLForFinalReviewResponse.py +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -33,7 +33,7 @@ class FinalReviewAnnotationAgencyAutoInfo(BaseModel): ) class FinalReviewAnnotationAgencyInfo(BaseModel): - confirmed: Optional[GetNextURLForAgencyAgencyInfo] = Field( + confirmed: Optional[list[GetNextURLForAgencyAgencyInfo]] = Field( title="The confirmed agency for the URL", ) auto: Optional[FinalReviewAnnotationAgencyAutoInfo] = Field( diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 31afb7c4..9f9719a7 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -298,7 +298,7 @@ async def agency_auto_suggestions( async def agency_confirmed_suggestion( self, url_id: int - ): + ) -> int: """ Creates a confirmed agency suggestion and returns the auto-generated pdap_agency_id diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index 715c6926..b43a3ae8 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -1,6 +1,6 @@ import pytest -from collector_db.models import URL, URLOptionalDataSourceMetadata +from collector_db.models import URL, URLOptionalDataSourceMetadata, ConfirmedURLAgency from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse @@ -22,6 +22,7 @@ async def test_review_next_source(api_test_helper): url_id=url_mapping.url_id, count=3 ) + confirmed_agency_id = await ath.db_data_creator.agency_confirmed_suggestion(url_id=url_mapping.url_id) outer_result = await ath.request_validator.review_next_source() @@ -68,6 +69,12 @@ async def test_review_next_source(api_test_helper): for i in range(3): assert user_agency_suggestions_as_list[i].count == 3 - i + # Check confirmed agencies exist + confirmed_agencies = agency_info.confirmed + assert len(confirmed_agencies) == 1 + confirmed_agency = confirmed_agencies[0] + assert confirmed_agency.pdap_agency_id == confirmed_agency_id + @pytest.mark.asyncio async def test_approve_and_get_next_source_for_review(api_test_helper): ath = api_test_helper @@ -80,16 +87,16 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): ) # Add confirmed agency - agency_id = await db_data_creator.agency_confirmed_suggestion( - url_id=url_mapping.url_id - ) + confirmed_agency = await db_data_creator.confirmed_suggestions([url_mapping.url_id]) + + agency_ids = [await db_data_creator.agency() for _ in range(3)] result: GetNextURLForFinalReviewOuterResponse = await ath.request_validator.approve_and_get_next_source_for_review( approval_info=FinalReviewApprovalInfo( url_id=url_mapping.url_id, record_type=RecordType.ARREST_RECORDS, relevant=True, - agency_id=agency_id, + agency_ids=agency_ids, name="New Test Name", description="New Test Description", record_formats=["New Test Record Format", "New Test Record Format 2"], @@ -106,7 +113,6 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.agency_id == agency_id assert url.record_type == 
RecordType.ARREST_RECORDS.value
     assert url.relevant == True
     assert url.outcome == URLStatus.VALIDATED.value
@@ -119,4 +125,10 @@
     assert optional_metadata[0].supplying_entity == "New Test Supplying Entity"
     assert optional_metadata[0].record_formats == ["New Test Record Format", "New Test Record Format 2"]
 
+    # Get agencies
+    agencies = await adb_client.get_all(ConfirmedURLAgency)
+    assert len(agencies) == 3
+    for agency in agencies:
+        assert agency.agency_id in agency_ids
+
diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py
index cd2527b6..9eb49570 100644
--- a/tests/test_automated/integration/collector_db/test_db_client.py
+++ b/tests/test_automated/integration/collector_db/test_db_client.py
@@ -286,7 +286,7 @@ async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBD
 
     annotations = result.annotations
     agency = annotations.agency
-    assert agency.confirmed is None
+    assert agency.confirmed == []
 
     assert agency.auto.unknown is True
     assert agency.auto.suggestions == []

From cc46ef74ae88f1d6deded7fc53f0696726564a69 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Fri, 28 Mar 2025 17:31:46 -0400
Subject: [PATCH 068/182] feat(app): Allow multiple confirmed agencies for URL

---
 collector_db/AsyncDatabaseClient.py           | 21 ++++++++++++-------
 collector_db/StatementComposer.py             | 13 +++++++-----
 .../collector_db/test_db_client.py            | 10 ++++++---
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index 2d6f2efe..bfd446b1 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -728,7 +728,9 @@ async def has_urls_without_agency_suggestions(
         statement = (
             select(
                 URL.id
-            ).where(URL.agency_id == None))
+            )
+        )
+
         statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement)
         raw_result = await session.execute(statement)
         result = raw_result.all()
@@ -936,7 +938,7 @@ async def add_agency_manual_suggestion(
 
     @session_manager
     async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[URL]:
-        statement = select(URL).where(URL.agency_id != None)
+        statement = select(URL).where(exists().where(ConfirmedURLAgency.url_id == URL.id))
         results = await session.execute(statement)
         return list(results.scalars().all())
 
@@ -1163,8 +1165,9 @@ def update_if_not_none(
             )
 
         # Get existing agency ids
-        existing_agency_ids = [agency.agency_id for agency in url.confirmed_agencies]
-        new_agency_ids = approval_info.agency_ids
+        existing_agencies = url.confirmed_agencies or []
+        existing_agency_ids = [agency.agency_id for agency in existing_agencies]
+        new_agency_ids = approval_info.agency_ids or []
         if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0:
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
@@ -1172,10 +1175,12 @@ def update_if_not_none(
             )
 
         # Get any existing agency ids that are not in the new agency ids
-        for existing_agency in url.confirmed_agencies:
-            if existing_agency.id not in new_agency_ids:
-                # If the existing agency id is not in the new agency ids, delete it
-                await session.delete(existing_agency)
+        # If new agency ids are specified, overwrite existing
+        if len(new_agency_ids) != 0:
+            for existing_agency in existing_agencies:
+                if existing_agency.agency_id not in new_agency_ids:
+                    # If the existing agency id is not in the new agency ids, delete it
+                    await
session.delete(existing_agency) # Add any new agency ids that are not in the existing agency ids for new_agency_id in new_agency_ids: if new_agency_id not in existing_agency_ids: diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 88da61f3..42dcf84c 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -4,7 +4,8 @@ from sqlalchemy.orm import aliased from collector_db.enums import URLMetadataAttributeType, ValidationStatus -from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch +from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch, \ + ConfirmedURLAgency from collector_manager.enums import URLStatus, CollectorType @@ -54,10 +55,12 @@ def exclude_urls_with_agency_suggestions( # Aliases for clarity AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) - statement = (statement - .where(~exists().where(AutomatedSuggestion.url_id == URL.id)) # Exclude if automated suggestions exist - ) # Exclude if confirmed agencies exist - + statement = statement.where( + ~exists().where(AutomatedSuggestion.url_id == URL.id) + ) # Exclude if automated suggestions exist + statement = statement.where( + ~exists().where(ConfirmedURLAgency.url_id == URL.id) + ) return statement diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 9eb49570..fe6fad2f 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -11,7 +11,7 @@ from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata +from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.enums import BatchStatus, RecordType, SuggestionType @@ -345,13 +345,17 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.agency_id == agency_id assert url.record_type == RecordType.ARREST_RECORDS.value assert url.relevant == True assert url.outcome == URLStatus.VALIDATED.value assert url.name == "Test Name" assert url.description == "Test Description" + confirmed_agency = await adb_client.get_all(ConfirmedURLAgency) + assert len(confirmed_agency) == 1 + assert confirmed_agency[0].url_id == url_mapping.url_id + assert confirmed_agency[0].agency_id == agency_id + approving_user_urls = await adb_client.get_all(ApprovingUserURL) assert len(approving_user_urls) == 1 assert approving_user_urls[0].user_id == 1 @@ -387,7 +391,7 @@ async def test_approval_url_error(db_data_creator: DBDataCreator): # Create kwarg dictionary with all required approval info fields kwarg_dict = { "record_type": RecordType.ARREST_RECORDS, - "agency_id": await db_data_creator.agency(), + "agency_ids": [await db_data_creator.agency()], "name": "Test Name", "description": "Test Description", } From 852a3760f665e821f3432b343b623248fe9e30a6 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 28 Mar 2025 20:34:35 -0400 Subject: 
[PATCH 069/182] feat(app): `/review/approve-source` new agencies added to db Previously, agency ids not already in the database were rejected. Now these are permitted with a placeholder name --- collector_db/AsyncDatabaseClient.py | 16 ++++++++++++++++ collector_db/constants.py | 3 +++ .../integration/api/test_review.py | 19 +++++++++++++++---- 3 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 collector_db/constants.py diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index bfd446b1..06ad81bd 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -17,6 +17,7 @@ from collector_db.DTOs.URLMapping import URLMapping from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.StatementComposer import StatementComposer +from collector_db.constants import PLACEHOLDER_AGENCY_NAME from collector_db.enums import URLMetadataAttributeType, TaskType from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ @@ -1184,6 +1185,21 @@ def update_if_not_none( # Add any new agency ids that are not in the existing agency ids for new_agency_id in new_agency_ids: if new_agency_id not in existing_agency_ids: + # Check if the new agency exists in the database + query = ( + select(Agency) + .where(Agency.agency_id == new_agency_id) + ) + existing_agency = await session.execute(query) + existing_agency = existing_agency.scalars().first() + if existing_agency is None: + # If not, create it + agency = Agency( + agency_id=new_agency_id, + name=PLACEHOLDER_AGENCY_NAME, + ) + session.add(agency) + # If the new agency id is not in the existing agency ids, add it confirmed_url_agency = ConfirmedURLAgency( url_id=approval_info.url_id, diff --git a/collector_db/constants.py b/collector_db/constants.py new file mode 100644 index 00000000..294c8fd9 --- /dev/null +++ b/collector_db/constants.py @@ -0,0 +1,3 @@ + + +PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index b43a3ae8..009a7638 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -1,6 +1,7 @@ import pytest -from collector_db.models import URL, URLOptionalDataSourceMetadata, ConfirmedURLAgency +from collector_db.constants import PLACEHOLDER_AGENCY_NAME +from collector_db.models import URL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Agency from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse @@ -89,7 +90,11 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): # Add confirmed agency confirmed_agency = await db_data_creator.confirmed_suggestions([url_mapping.url_id]) + # Additionally, include an agency not yet included in the database + additional_agency = 999999 + agency_ids = [await db_data_creator.agency() for _ in range(3)] + agency_ids.append(additional_agency) result: GetNextURLForFinalReviewOuterResponse = await ath.request_validator.approve_and_get_next_source_for_review( approval_info=FinalReviewApprovalInfo( @@ -126,9 +131,15 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): assert optional_metadata[0].record_formats == ["New Test Record 
Format", "New Test Record Format 2"] # Get agencies - agencies = await adb_client.get_all(ConfirmedURLAgency) - assert len(agencies) == 3 - for agency in agencies: + confirmed_agencies = await adb_client.get_all(ConfirmedURLAgency) + assert len(confirmed_agencies) == 4 + for agency in confirmed_agencies: assert agency.agency_id in agency_ids + # Check that created agency has placeholder + agencies = await adb_client.get_all(Agency) + for agency in agencies: + if agency.agency_id == additional_agency: + assert agency.name == PLACEHOLDER_AGENCY_NAME + From d99d189de050092740df64f8f7d6d534a276111d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 29 Mar 2025 10:44:06 -0400 Subject: [PATCH 070/182] fix(database): Fix bug causing validated URLs to show up for some annotations --- collector_db/AsyncDatabaseClient.py | 1 + collector_db/StatementComposer.py | 4 +- .../collector_db/test_db_client.py | 68 ++++++++++++++++++- 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 06ad81bd..05006228 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -126,6 +126,7 @@ async def get_next_url_for_user_annotation( select( URL, ) + .where(URL.outcome == URLStatus.PENDING.value) .where(exists(select(URLHTMLContent).where(URLHTMLContent.url_id == URL.id))) # URL must not have metadata annotation by this user .where( diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 42dcf84c..c80b83e5 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -55,9 +55,11 @@ def exclude_urls_with_agency_suggestions( # Aliases for clarity AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) + # Exclude if automated suggestions exist statement = statement.where( ~exists().where(AutomatedSuggestion.url_id == URL.id) - ) # Exclude if automated suggestions exist + ) + # Exclude if confirmed agencies exist statement = statement.where( ~exists().where(ConfirmedURLAgency.url_id == URL.id) ) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index fe6fad2f..a2cb25a9 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -6,12 +6,14 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo +from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.LogInfo import LogInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency +from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, \ + UserRelevantSuggestion from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.enums import BatchStatus, RecordType, SuggestionType @@ -421,3 +423,67 @@ async def test_approval_url_error(db_data_creator: DBDataCreator): user_id=1 ) +@pytest.mark.asyncio +async def test_get_next_url_for_user_relevance_annotation_pending( + db_data_creator: DBDataCreator +): + 

+    batch_id = db_data_creator.batch()
+
+    # Create 1 URL with outcome `pending`
+    iui: InsertURLsInfo = db_data_creator.urls(
+        batch_id=batch_id,
+        url_count=1,
+        outcome=URLStatus.PENDING
+    )
+
+    url_1 = iui.url_mappings[0]
+
+    # Add `Relevancy` attribute with value `True`
+    await db_data_creator.auto_relevant_suggestions(
+        url_id=url_1.url_id,
+        relevant=True
+    )
+
+    # Add HTML data
+    await db_data_creator.html_data([url_1.url_id])
+
+    adb_client = db_data_creator.adb_client
+    url = await adb_client.get_next_url_for_relevance_annotation(
+        user_id=1
+    )
+    assert url is not None
+
+@pytest.mark.asyncio
+async def test_get_next_url_for_user_relevance_annotation_validated(
+    db_data_creator: DBDataCreator
+):
+    """
+    A validated URL should not turn up in get_next_url_for_user_annotation
+    """
+
+    batch_id = db_data_creator.batch()
+
+    # Create 1 URL with outcome `validated`
+    iui: InsertURLsInfo = db_data_creator.urls(
+        batch_id=batch_id,
+        url_count=1,
+        outcome=URLStatus.VALIDATED
+    )
+
+    url_1 = iui.url_mappings[0]
+
+    # Add `Relevancy` attribute with value `True`
+    await db_data_creator.auto_relevant_suggestions(
+        url_id=url_1.url_id,
+        relevant=True
+    )
+
+    # Add HTML data
+    await db_data_creator.html_data([url_1.url_id])
+
+    adb_client = db_data_creator.adb_client
+    url = await adb_client.get_next_url_for_relevance_annotation(
+        user_id=1
+    )
+    assert url is None
\ No newline at end of file

From a3dedcd0574746a63ce1241e3527189078a52270 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 31 Mar 2025 19:52:24 -0400
Subject: [PATCH 071/182] DRAFT

---
 ..._add_data_source_id_column_to_url_table.py | 31 +++++++++++++++++++
 collector_db/models.py                        |  1 +
 .../task_data_objects/SubmitApprovedURLTDO.py |  8 ++++-
 core/classes/SubmitApprovedURLTaskOperator.py |  1 +
 4 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py

diff --git a/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py b/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py
new file mode 100644
index 00000000..8e15dbf2
--- /dev/null
+++ b/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py
@@ -0,0 +1,31 @@
+"""Add data source ID column to URL table
+
+Revision ID: 33a546c93441
+Revises: 5ea47dacd0ef
+Create Date: 2025-03-29 17:16:11.863064
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '33a546c93441' +down_revision: Union[str, None] = '5ea47dacd0ef' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + 'url', + sa.Column('data_source_id', sa.Integer(), nullable=True) + ) + # Add unique constraint to data_source_id column + op.create_unique_constraint('uq_data_source_id', 'url', ['data_source_id']) + + +def downgrade() -> None: + op.drop_column('url', 'data_source_id') diff --git a/collector_db/models.py b/collector_db/models.py index 55b75af2..4a82e68c 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -105,6 +105,7 @@ class URL(Base): relevant = Column(Boolean, nullable=True) created_at = get_created_at_column() updated_at = get_updated_at_column() + data_source_id = Column(Integer, nullable=True) # Relationships batch = relationship("Batch", back_populates="urls") diff --git a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py index ee1b8dc6..fc6e789b 100644 --- a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py +++ b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py @@ -8,4 +8,10 @@ class SubmitApprovedURLTDO(BaseModel): url: str record_type: RecordType - agency_id: Optional[int] \ No newline at end of file + agency_id: Optional[int] + name: str + description: str + record_formats: Optional[list[str]] = None + data_portal_type: Optional[str] = None + supplying_entity: Optional[str] = None + data_source_id: Optional[int] = None \ No newline at end of file diff --git a/core/classes/SubmitApprovedURLTaskOperator.py b/core/classes/SubmitApprovedURLTaskOperator.py index 633f8c1e..06b28a18 100644 --- a/core/classes/SubmitApprovedURLTaskOperator.py +++ b/core/classes/SubmitApprovedURLTaskOperator.py @@ -1,5 +1,6 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.enums import TaskType +from core.DTOs.task_data_objects.UrlHtmlTDO import UrlHtmlTDO from core.classes.TaskOperatorBase import TaskOperatorBase from pdap_api_client.PDAPClient import PDAPClient From c5e75288a12345023487768c8c4f1924ad6412fa Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 1 Apr 2025 15:27:18 -0400 Subject: [PATCH 072/182] Set default for snippet if none exists. 
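
The Google Custom Search API does not return a "snippet" key for every
result item, so indexing the dict directly can raise a KeyError and abort
the whole query. A minimal sketch of the difference (the item dict here is
illustrative, not a real API response):

    item = {"link": "https://example.com", "title": "Example"}  # no "snippet"
    item["snippet"]            # raises KeyError
    item.get("snippet", "")    # falls back to an empty string

Using the safe accessor lets the collector keep processing the remaining
results when a snippet is missing.
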
--- source_collectors/auto_googler/GoogleSearcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source_collectors/auto_googler/GoogleSearcher.py b/source_collectors/auto_googler/GoogleSearcher.py index 6f7b4cc8..7d599513 100644 --- a/source_collectors/auto_googler/GoogleSearcher.py +++ b/source_collectors/auto_googler/GoogleSearcher.py @@ -73,7 +73,7 @@ def get_query_results(self, query) -> list[GoogleSearchQueryResultsInnerDTO] or inner_dto = GoogleSearchQueryResultsInnerDTO( url=item["link"], title=item["title"], - snippet=item["snippet"] + snippet=item.get("snippet", ""), ) items.append(inner_dto) From bf756ac7d04187ae9a24a9c95e04ee689a555541 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 2 Apr 2025 11:49:22 -0400 Subject: [PATCH 073/182] DRAFT --- collector_db/AsyncDatabaseClient.py | 12 ++++++++++-- .../task_data_objects/URLMiscellaneousMetadataTDO.py | 4 ++++ core/classes/URLMiscellaneousMetadataTaskOperator.py | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 05006228..5b057636 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -12,7 +12,7 @@ from collector_db.DTOConverter import DTOConverter from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo -from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo +from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMapping import URLMapping from collector_db.DTOs.URLWithHTML import URLWithHTML @@ -37,7 +37,7 @@ GetURLsResponseInnerInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO -from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO +from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from core.enums import BatchStatus, SuggestionType, RecordType from html_tag_collector.DataClassTags import convert_to_response_html_info @@ -375,6 +375,7 @@ async def get_pending_urls_missing_miscellaneous_metadata( query = ( query.options( selectinload(URL.batch), + selectinload(URL.html_content) ).limit(100).order_by(URL.id) ) @@ -387,6 +388,13 @@ async def get_pending_urls_missing_miscellaneous_metadata( collector_metadata=result.collector_metadata, collector_type=CollectorType(result.batch.strategy), ) + html_info = URLHTMLMetadataInfo() + for html_content in result.html_content: + if html_content.content_type == HTMLContentType.TITLE.value: + html_info.title = html_content.content + elif html_content.content_type == HTMLContentType.DESCRIPTION.value: + html_info.description = html_content.content + tdo.html_metadata_info = html_info final_results.append(tdo) return final_results diff --git a/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py b/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py index d57d1cba..ff173a8e 100644 --- a/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py +++ b/core/DTOs/task_data_objects/URLMiscellaneousMetadataTDO.py @@ -4,6 +4,9 @@ from collector_manager.enums import CollectorType +class URLHTMLMetadataInfo(BaseModel): + title: Optional[str] = None + description: Optional[str] = None class URLMiscellaneousMetadataTDO(BaseModel): url_id: int @@ 
-14,3 +17,4 @@ class URLMiscellaneousMetadataTDO(BaseModel): record_formats: Optional[list[str]] = None data_portal_type: Optional[str] = None supplying_entity: Optional[str] = None + html_metadata_info: Optional[URLHTMLMetadataInfo] = None diff --git a/core/classes/URLMiscellaneousMetadataTaskOperator.py b/core/classes/URLMiscellaneousMetadataTaskOperator.py index 38e7446a..4b9becdb 100644 --- a/core/classes/URLMiscellaneousMetadataTaskOperator.py +++ b/core/classes/URLMiscellaneousMetadataTaskOperator.py @@ -41,6 +41,12 @@ async def get_subtask(self, collector_type: CollectorType) -> MiscellaneousMetad case _: raise Exception(f"Unknown collector type: {collector_type}") + async def html_default_logic(self, tdo: URLMiscellaneousMetadataTDO): + if tdo.name is None: + tdo.name = tdo.html_metadata_info.title + if tdo.description is None: + tdo.description = tdo.html_metadata_info.description + async def inner_task_logic(self): tdos: list[URLMiscellaneousMetadataTDO] = await self.adb_client.get_pending_urls_missing_miscellaneous_metadata() await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) @@ -50,6 +56,7 @@ async def inner_task_logic(self): subtask = await self.get_subtask(tdo.collector_type) try: subtask.process(tdo) + await self.html_default_logic(tdo) except Exception as e: error_info = URLErrorPydanticInfo( task_id=self.task_id, From 8b33344fa286d8a1880aeec69b423ea85a292ab1 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 2 Apr 2025 15:49:13 -0400 Subject: [PATCH 074/182] feat(app): Add batch filtering for annotation requests --- api/routes/annotate.py | 63 ++++-- api/routes/review.py | 19 +- collector_db/AsyncDatabaseClient.py | 51 +++-- core/AsyncCore.py | 82 +++++--- tests/helpers/complex_test_data_functions.py | 42 +++- .../integration/api/test_review.py | 6 +- .../collector_db/test_db_client.py | 194 ++++++++++++++---- 7 files changed, 351 insertions(+), 106 deletions(-) diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 53486d7d..84ba00e4 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -1,14 +1,13 @@ -from fastapi import APIRouter, Depends, Path +from typing import Optional + +from fastapi import APIRouter, Depends, Path, Query from api.dependencies import get_async_core -from collector_db.enums import URLMetadataAttributeType from core.AsyncCore import AsyncCore from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo -from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo, \ - GetNextRelevanceAnnotationResponseOuterInfo +from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo -from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from security_manager.SecurityManager import get_access_info, AccessInfo @@ -24,11 +23,15 @@ async def get_next_url_for_relevance_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. 
" + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextRelevanceAnnotationResponseOuterInfo: - result = await async_core.get_next_url_for_relevance_annotation( + return await async_core.get_next_url_for_relevance_annotation( user_id=access_info.user_id, + batch_id=batch_id ) - return result @annotate_router.post("/relevance/{url_id}") @@ -36,7 +39,11 @@ async def annotate_url_for_relevance_and_get_next_url( relevance_annotation_post_info: RelevanceAnnotationPostInfo, url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), - access_info: AccessInfo = Depends(get_access_info) + access_info: AccessInfo = Depends(get_access_info), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. " + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextRelevanceAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate @@ -48,51 +55,71 @@ async def annotate_url_for_relevance_and_get_next_url( ) return await async_core.get_next_url_for_relevance_annotation( user_id=access_info.user_id, + batch_id=batch_id ) @annotate_router.get("/record-type") async def get_next_url_for_record_type_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. " + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextRecordTypeAnnotationResponseOuterInfo: - result = await async_core.get_next_url_for_record_type_annotation( + return await async_core.get_next_url_for_record_type_annotation( user_id=access_info.user_id, + batch_id=batch_id ) - return result @annotate_router.post("/record-type/{url_id}") async def annotate_url_for_record_type_and_get_next_url( record_type_annotation_post_info: RecordTypeAnnotationPostInfo, url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), - access_info: AccessInfo = Depends(get_access_info) + access_info: AccessInfo = Depends(get_access_info), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. " + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextRecordTypeAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate """ - result = await async_core.submit_url_record_type_annotation( + await async_core.submit_url_record_type_annotation( user_id=access_info.user_id, url_id=url_id, record_type=record_type_annotation_post_info.record_type, ) - return result + return await async_core.get_next_url_for_record_type_annotation( + user_id=access_info.user_id, + batch_id=batch_id + ) @annotate_router.get("/agency") async def get_next_url_for_agency_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. 
" + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextURLForAgencyAnnotationResponse: - result = await async_core.get_next_url_agency_for_annotation( + return await async_core.get_next_url_agency_for_annotation( user_id=access_info.user_id, + batch_id=batch_id ) - return result @annotate_router.post("/agency/{url_id}") async def annotate_url_for_agency_and_get_next_url( url_id: int, agency_annotation_post_info: URLAgencyAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), - access_info: AccessInfo = Depends(get_access_info) + access_info: AccessInfo = Depends(get_access_info), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. " + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextURLForAgencyAnnotationResponse: """ Post URL annotation and get next URL to annotate @@ -102,7 +129,7 @@ async def annotate_url_for_agency_and_get_next_url( url_id=url_id, agency_post_info=agency_annotation_post_info ) - result = await async_core.get_next_url_agency_for_annotation( + return await async_core.get_next_url_agency_for_annotation( user_id=access_info.user_id, + batch_id=batch_id ) - return result \ No newline at end of file diff --git a/api/routes/review.py b/api/routes/review.py index 61dccbbb..25ac85e8 100644 --- a/api/routes/review.py +++ b/api/routes/review.py @@ -1,4 +1,6 @@ -from fastapi import APIRouter, Depends +from typing import Optional + +from fastapi import APIRouter, Depends, Query from api.dependencies import get_async_core from core.AsyncCore import AsyncCore @@ -17,18 +19,27 @@ async def get_next_source( core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. " + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextURLForFinalReviewOuterResponse: - next_source = await core.get_next_source_for_review() + next_source = await core.get_next_source_for_review(batch_id=batch_id) return GetNextURLForFinalReviewOuterResponse(next_source=next_source) @review_router.post("/approve-source") async def approve_source( core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - approval_info: FinalReviewApprovalInfo = FinalReviewApprovalInfo + approval_info: FinalReviewApprovalInfo = FinalReviewApprovalInfo, + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. 
" + "If not specified, defaults to first qualifying URL", + default=None), ) -> GetNextURLForFinalReviewOuterResponse: next_source = await core.approve_and_get_next_source_for_review( approval_info, - access_info=access_info + access_info=access_info, + batch_id=batch_id ) return GetNextURLForFinalReviewOuterResponse(next_source=next_source) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 05006228..7ff5f8ad 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -2,7 +2,7 @@ from typing import Optional, Type, Any from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert, asc from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute from sqlalchemy.sql.functions import coalesce @@ -120,7 +120,8 @@ async def get_next_url_for_user_annotation( session: AsyncSession, user_suggestion_model_to_exclude: UserSuggestionModel, auto_suggestion_relationship: QueryableAttribute, - user_id: int + user_id: int, + batch_id: Optional[int] ) -> URL: url_query = ( select( @@ -139,12 +140,15 @@ async def get_next_url_for_user_annotation( ) ) ) - ).options( + ) + ) + if batch_id is not None: + url_query = url_query.where(URL.batch_id == batch_id) + + url_query = url_query.options( joinedload(auto_suggestion_relationship), joinedload(URL.html_content) - ). - limit(1) - ) + ).limit(1) raw_result = await session.execute(url_query) @@ -179,14 +183,16 @@ async def add_user_relevant_suggestion( async def get_next_url_for_relevance_annotation( self, session: AsyncSession, - user_id: int + user_id: int, + batch_id: Optional[int] ) -> Optional[GetNextRelevanceAnnotationResponseInfo]: url = await self.get_next_url_for_user_annotation( session, user_suggestion_model_to_exclude=UserRelevantSuggestion, auto_suggestion_relationship=URL.auto_relevant_suggestion, - user_id=user_id + user_id=user_id, + batch_id=batch_id ) if url is None: return None @@ -218,14 +224,16 @@ async def get_next_url_for_relevance_annotation( async def get_next_url_for_record_type_annotation( self, session: AsyncSession, - user_id: int + user_id: int, + batch_id: Optional[int] ) -> Optional[GetNextRecordTypeAnnotationResponseInfo]: url = await self.get_next_url_for_user_annotation( session, user_suggestion_model_to_exclude=UserRecordTypeSuggestion, auto_suggestion_relationship=URL.auto_record_type_suggestion, - user_id=user_id + user_id=user_id, + batch_id=batch_id ) if url is None: return None @@ -767,7 +775,10 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li @session_manager async def get_next_url_agency_for_annotation( - self, session: AsyncSession, user_id: int + self, + session: AsyncSession, + user_id: int, + batch_id: Optional[int] ) -> GetNextURLForAgencyAnnotationResponse: """ Retrieve URL for annotation @@ -785,8 +796,14 @@ async def get_next_url_agency_for_annotation( URL.outcome == URLStatus.PENDING.value ) ) - # Must not have been annotated by this user - .join(UserUrlAgencySuggestion, isouter=True) + ) + + if batch_id is not None: + statement = statement.where(URL.batch_id == batch_id) + + # Must not have been annotated by this user + statement = ( + statement.join(UserUrlAgencySuggestion, isouter=True) 
.where( ~exists( select(UserUrlAgencySuggestion). @@ -947,7 +964,8 @@ async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[ @session_manager async def get_next_url_for_final_review( self, - session: AsyncSession + session: AsyncSession, + batch_id: Optional[int] ) -> Optional[GetNextURLForFinalReviewResponse]: @@ -1029,6 +1047,10 @@ def count_subquery(model: Type[Base]): url_query = url_query.where( URL.outcome == URLStatus.PENDING.value ) + if batch_id is not None: + url_query = url_query.where( + URL.batch_id == batch_id + ) # The below relationships are joined directly to the URL single_join_relationships = [ @@ -1060,6 +1082,7 @@ def count_subquery(model: Type[Base]): url_query = url_query.order_by( desc("total_distinct_annotation_count"), desc("total_overall_annotation_count"), + asc(URL.id) ) # Apply limit diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 8b422d7d..43b81176 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from aiohttp import ClientSession @@ -52,6 +53,12 @@ def __init__( self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) + + async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: + return await self.adb_client.get_urls(page=page, errors=errors) + + + #region Task Operators async def get_url_html_task_operator(self): self.logger.info("Running URL HTML Task") operator = URLHTMLTaskOperator( @@ -107,6 +114,9 @@ async def get_task_operators(self) -> list[TaskOperatorBase]: await self.get_url_miscellaneous_metadata_task_operator() ] + #endregion + + #region Tasks async def run_tasks(self): operators = await self.get_task_operators() for operator in operators: @@ -141,6 +151,17 @@ async def handle_task_error(self, run_info: TaskOperatorRunInfo): await self.adb_client.update_task_status(task_id=run_info.task_id, status=BatchStatus.ERROR) await self.adb_client.add_task_error(task_id=run_info.task_id, error=run_info.message) + async def get_task_info(self, task_id: int) -> TaskInfo: + return await self.adb_client.get_task_info(task_id=task_id) + + async def get_tasks(self, page: int, task_type: TaskType, task_status: BatchStatus) -> GetTasksResponse: + return await self.adb_client.get_tasks(page=page, task_type=task_type, task_status=task_status) + + + #endregion + + #region Annotations and Review + async def submit_url_relevance_annotation( self, user_id: int, @@ -153,14 +174,28 @@ async def submit_url_relevance_annotation( relevant=relevant ) - async def get_next_url_for_relevance_annotation(self, user_id: int) -> GetNextRelevanceAnnotationResponseOuterInfo: - next_annotation = await self.adb_client.get_next_url_for_relevance_annotation(user_id=user_id) + async def get_next_url_for_relevance_annotation( + self, + user_id: int, + batch_id: Optional[int] + ) -> GetNextRelevanceAnnotationResponseOuterInfo: + next_annotation = await self.adb_client.get_next_url_for_relevance_annotation( + user_id=user_id, + batch_id=batch_id + ) return GetNextRelevanceAnnotationResponseOuterInfo( next_annotation=next_annotation ) - async def get_next_url_for_record_type_annotation(self, user_id: int) -> GetNextRecordTypeAnnotationResponseOuterInfo: - next_annotation = await self.adb_client.get_next_url_for_record_type_annotation(user_id=user_id) + async def get_next_url_for_record_type_annotation( + self, + user_id: int, + batch_id: Optional[int] + ) -> GetNextRecordTypeAnnotationResponseOuterInfo: + next_annotation = await 
self.adb_client.get_next_url_for_record_type_annotation( + user_id=user_id, + batch_id=batch_id + ) return GetNextRecordTypeAnnotationResponseOuterInfo( next_annotation=next_annotation ) @@ -169,33 +204,24 @@ async def submit_url_record_type_annotation( self, user_id: int, url_id: int, - record_type: RecordType + record_type: RecordType, ): await self.adb_client.add_user_record_type_suggestion( user_id=user_id, url_id=url_id, record_type=record_type ) - next_annotation = await self.adb_client.get_next_url_for_record_type_annotation(user_id=user_id) - return GetNextRecordTypeAnnotationResponseOuterInfo( - next_annotation=next_annotation - ) - - async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: - return await self.adb_client.get_urls(page=page, errors=errors) - - async def get_task_info(self, task_id: int) -> TaskInfo: - return await self.adb_client.get_task_info(task_id=task_id) - - async def get_tasks(self, page: int, task_type: TaskType, task_status: BatchStatus) -> GetTasksResponse: - return await self.adb_client.get_tasks(page=page, task_type=task_type, task_status=task_status) async def get_next_url_agency_for_annotation( self, - user_id: int + user_id: int, + batch_id: Optional[int] ) -> GetNextURLForAgencyAnnotationResponse: - return await self.adb_client.get_next_url_agency_for_annotation(user_id=user_id) + return await self.adb_client.get_next_url_agency_for_annotation( + user_id=user_id, + batch_id=batch_id + ) async def submit_url_agency_annotation( self, @@ -217,17 +243,25 @@ async def submit_url_agency_annotation( is_new=agency_post_info.is_new, ) - async def get_next_source_for_review(self): - return await self.adb_client.get_next_url_for_final_review() + async def get_next_source_for_review( + self, + batch_id: Optional[int] + ): + return await self.adb_client.get_next_url_for_final_review( + batch_id=batch_id + ) async def approve_and_get_next_source_for_review( self, approval_info: FinalReviewApprovalInfo, - access_info: AccessInfo + access_info: AccessInfo, + batch_id: Optional[int] ): await self.adb_client.approve_url( approval_info=approval_info, user_id=access_info.user_id ) - return await self.get_next_source_for_review() + return await self.get_next_source_for_review( + batch_id=batch_id + ) diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 4aa9a86f..104402c0 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -1,14 +1,44 @@ -from collector_db.enums import URLMetadataAttributeType, ValidationSource, ValidationStatus +from pydantic import BaseModel + +from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo +from collector_db.DTOs.URLMapping import URLMapping +from collector_manager.enums import URLStatus from core.enums import RecordType from tests.helpers.DBDataCreator import DBDataCreator +class AnnotationSetupInfo(BaseModel): + batch_id: int + insert_urls_info: InsertURLsInfo + +async def setup_for_get_next_url_for_annotation( + db_data_creator: DBDataCreator, + url_count: int, + outcome: URLStatus = URLStatus.PENDING +) -> AnnotationSetupInfo: + batch_id = db_data_creator.batch() + insert_urls_info = db_data_creator.urls( + batch_id=batch_id, + url_count=url_count, + outcome=outcome + ) + await db_data_creator.html_data( + [ + url.url_id for url in insert_urls_info.url_mappings + ] + ) + return AnnotationSetupInfo(batch_id=batch_id, insert_urls_info=insert_urls_info) + + +class FinalReviewSetupInfo(BaseModel): + 
batch_id: int + url_mapping: URLMapping async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, annotation_count: int, include_user_annotations: bool = True, include_miscellaneous_metadata: bool = True -): +) -> FinalReviewSetupInfo: """ Sets up the database to test the final_review functions Auto-labels the URL with 'relevant=True' and 'record_type=ARREST_RECORDS' @@ -30,7 +60,7 @@ async def add_agency_suggestion(count: int): await db_data_creator.agency_user_suggestions( url_id=url_mapping.url_id, agency_id=agency_id - ) + ) async def add_record_type_suggestion(count: int, record_type: RecordType): for i in range(count): @@ -68,5 +98,7 @@ async def add_relevant_suggestion(count: int, relevant: bool): for i in range(annotation_count): await add_agency_suggestion(i + 1) - - return url_mapping + return FinalReviewSetupInfo( + batch_id=batch_id, + url_mapping=url_mapping + ) diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index 009a7638..b4a94387 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -13,11 +13,12 @@ async def test_review_next_source(api_test_helper): ath = api_test_helper - url_mapping = await setup_for_get_next_url_for_final_review( + setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, annotation_count=3, include_user_annotations=True ) + url_mapping = setup_info.url_mapping await ath.db_data_creator.agency_auto_suggestions( url_id=url_mapping.url_id, @@ -81,11 +82,12 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): ath = api_test_helper db_data_creator = ath.db_data_creator - url_mapping = await setup_for_get_next_url_for_final_review( + setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=True ) + url_mapping = setup_info.url_mapping # Add confirmed agency confirmed_agency = await db_data_creator.confirmed_suggestions([url_mapping.url_id]) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index a2cb25a9..12031afa 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -17,6 +17,7 @@ from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.enums import BatchStatus, RecordType, SuggestionType +from helpers.complex_test_data_functions import setup_for_get_next_url_for_annotation from tests.helpers.DBDataCreator import DBDataCreator from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review @@ -156,19 +157,23 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato Test that an annotated URL is returned """ - url_mapping = await setup_for_get_next_url_for_final_review( + setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=True ) + url_mapping = setup_info.url_mapping + await db_data_creator.agency_auto_suggestions( url_id=url_mapping.url_id, count=3 ) - result = await db_data_creator.adb_client.get_next_url_for_final_review() + result = await db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=None + ) assert result.url == url_mapping.url 
html_info = result.html_info @@ -202,6 +207,36 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato for i in range(3): assert user_agency_suggestions_as_list[i].count == 3 - i +@pytest.mark.asyncio +async def test_get_next_url_for_final_review_batch_id_filtering(db_data_creator: DBDataCreator): + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=db_data_creator, + annotation_count=3, + include_user_annotations=True + ) + + url_mapping_1 = setup_info_1.url_mapping + url_mapping_2 = setup_info_2.url_mapping + + # If a batch id is provided, return first valid URL with that batch id + result_with_batch_id =await db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=setup_info_2.batch_id + ) + + assert result_with_batch_id.url == url_mapping_2.url + + # If no batch id is provided, return first valid URL + result_no_batch_id =await db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=None + ) + + assert result_no_batch_id.url == url_mapping_1.url @pytest.mark.asyncio @@ -211,17 +246,19 @@ async def test_get_next_url_for_final_review_favor_more_components(db_data_creat i.e., if one has annotations for record type and agency id, that should be favored over one with just record type """ - url_mapping_without_user_anno = await setup_for_get_next_url_for_final_review( + setup_info_without_user_anno = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=False ) + url_mapping_without_user_anno = setup_info_without_user_anno.url_mapping - url_mapping_with_user_anno = await setup_for_get_next_url_for_final_review( + setup_info_with_user_anno = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=True ) + url_mapping_with_user_anno = setup_info_with_user_anno.url_mapping # Have both be listed as unknown @@ -232,7 +269,9 @@ async def test_get_next_url_for_final_review_favor_more_components(db_data_creat suggestion_type=SuggestionType.UNKNOWN ) - result = await db_data_creator.adb_client.get_next_url_for_final_review() + result = await db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=None + ) assert result.id == url_mapping_with_user_anno.url_id @@ -246,24 +285,28 @@ async def test_get_next_url_for_final_review_favor_more_annotations( """ Test in the case of two URLs with the same number of components annotated, favoring the one with more total annotations """ - url_mapping_lower_count = await setup_for_get_next_url_for_final_review( + setup_info_lower_count = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=1, include_user_annotations=True ) + url_mapping_lower_count = setup_info_lower_count.url_mapping - url_mapping_higher_count = await setup_for_get_next_url_for_final_review( + setup_info_higher_count = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=True ) + url_mapping_higher_count = setup_info_higher_count.url_mapping for url_mapping in [url_mapping_lower_count, url_mapping_higher_count]: await db_data_creator.agency_confirmed_suggestion( url_id=url_mapping.url_id ) - result = await db_data_creator.adb_client.get_next_url_for_final_review() + result = await 
db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=None + ) assert result.id == url_mapping_higher_count.url_id @@ -281,7 +324,9 @@ async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBD batch_id = db_data_creator.batch() url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] - result = await db_data_creator.adb_client.get_next_url_for_final_review() + result = await db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=None + ) assert result.id == url_mapping.url_id @@ -314,17 +359,20 @@ async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator outcome=URLStatus.SUBMITTED ).url_mappings[0] - result = await db_data_creator.adb_client.get_next_url_for_final_review() + result = await db_data_creator.adb_client.get_next_url_for_final_review( + batch_id=None + ) assert result is None @pytest.mark.asyncio async def test_approve_url_basic(db_data_creator: DBDataCreator): - url_mapping = await setup_for_get_next_url_for_final_review( + setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=True ) + url_mapping = setup_info.url_mapping # Add confirmed agency agency_id = await db_data_creator.agency_confirmed_suggestion( @@ -343,7 +391,7 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): ) # Confirm same agency id is listed as confirmed - urls = await adb_client.get_all(URL) + urls: list[URL] = await adb_client.get_all(URL) assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id @@ -353,17 +401,17 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert url.name == "Test Name" assert url.description == "Test Description" - confirmed_agency = await adb_client.get_all(ConfirmedURLAgency) + confirmed_agency: list[ConfirmedURLAgency] = await adb_client.get_all(ConfirmedURLAgency) assert len(confirmed_agency) == 1 assert confirmed_agency[0].url_id == url_mapping.url_id assert confirmed_agency[0].agency_id == agency_id - approving_user_urls = await adb_client.get_all(ApprovingUserURL) + approving_user_urls: list[ApprovingUserURL] = await adb_client.get_all(ApprovingUserURL) assert len(approving_user_urls) == 1 assert approving_user_urls[0].user_id == 1 assert approving_user_urls[0].url_id == url_mapping.url_id - optional_metadata = await adb_client.get_all(URLOptionalDataSourceMetadata) + optional_metadata: list[URLOptionalDataSourceMetadata] = await adb_client.get_all(URLOptionalDataSourceMetadata) assert len(optional_metadata) == 1 assert optional_metadata[0].url_id == url_mapping.url_id assert optional_metadata[0].record_formats == ["Test Record Format", "Test Record Format 2"] @@ -372,12 +420,13 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): @pytest.mark.asyncio async def test_approval_url_error(db_data_creator: DBDataCreator): - url_mapping = await setup_for_get_next_url_for_final_review( + setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, annotation_count=3, include_user_annotations=True, include_miscellaneous_metadata=False ) + url_mapping = setup_info.url_mapping # Set all required descriptors to none and receive an error adb_client = db_data_creator.adb_client @@ -427,17 +476,12 @@ async def test_approval_url_error(db_data_creator: DBDataCreator): async def test_get_next_url_for_user_relevance_annotation_pending( db_data_creator: DBDataCreator ): - - batch_id = db_data_creator.batch() - - # 
Create 1 URL with outcome `pending`
-    iui: InsertURLsInfo = db_data_creator.urls(
-        batch_id=batch_id,
-        url_count=1,
-        outcome=URLStatus.PENDING
+    setup_info = await setup_for_get_next_url_for_annotation(
+        db_data_creator=db_data_creator,
+        url_count=2
     )
 
-    url_1 = iui.url_mappings[0]
+    url_1 = setup_info.insert_urls_info.url_mappings[0]
 
     # Add `Relevancy` attribute with value `True`
     await db_data_creator.auto_relevant_suggestions(
@@ -445,15 +489,91 @@
         relevant=True
     )
 
-    # Add HTML data
-    await db_data_creator.html_data([url_1.url_id])
-
     adb_client = db_data_creator.adb_client
     url = await adb_client.get_next_url_for_relevance_annotation(
-        user_id=1
+        user_id=1,
+        batch_id=None
     )
     assert url is not None
 
+@pytest.mark.asyncio
+async def test_get_next_url_for_annotation_batch_filtering(
+    db_data_creator: DBDataCreator
+):
+    """
+    Test that for all annotation retrievals, batch filtering works as expected
+    """
+    setup_info_1 = await setup_for_get_next_url_for_annotation(
+        db_data_creator=db_data_creator,
+        url_count=1
+    )
+    setup_info_2 = await setup_for_get_next_url_for_annotation(
+        db_data_creator=db_data_creator,
+        url_count=1
+    )
+
+    url_1 = setup_info_1.insert_urls_info.url_mappings[0]
+    url_2 = setup_info_2.insert_urls_info.url_mappings[0]
+
+    # Test for relevance
+    # If a batch id is provided, return first valid URL with that batch id
+    result_with_batch_id = await db_data_creator.adb_client.get_next_url_for_relevance_annotation(
+        user_id=1,
+        batch_id=setup_info_2.batch_id
+    )
+
+    assert result_with_batch_id.url_info.url == url_2.url
+
+    # If no batch id is provided, return first valid URL
+    result_no_batch_id = await db_data_creator.adb_client.get_next_url_for_relevance_annotation(
+        user_id=1,
+        batch_id=None
+    )
+
+    assert result_no_batch_id.url_info.url == url_1.url
+
+    # Test for record type
+    # If a batch id is provided, return first valid URL with that batch id
+    result_with_batch_id = await db_data_creator.adb_client.get_next_url_for_record_type_annotation(
+        user_id=1,
+        batch_id=setup_info_2.batch_id
+    )
+
+    assert result_with_batch_id.url_info.url == url_2.url
+
+    # If no batch id is provided, return first valid URL
+    result_no_batch_id = await db_data_creator.adb_client.get_next_url_for_record_type_annotation(
+        user_id=1,
+        batch_id=None
+    )
+
+    assert result_no_batch_id.url_info.url == url_1.url
+
+    # Test for agency
+    for url in [url_1, url_2]:
+        await db_data_creator.auto_suggestions(
+            url_ids=[url.url_id],
+            num_suggestions=2,
+            suggestion_type=SuggestionType.AUTO_SUGGESTION
+        )
+
+    # If a batch id is provided, return first valid URL with that batch id
+    result_with_batch_id = await db_data_creator.adb_client.get_next_url_agency_for_annotation(
+        user_id=1,
+        batch_id=setup_info_2.batch_id
+    )
+
+    assert result_with_batch_id.next_annotation.url == url_2.url
+
+    # If no batch id is provided, return first valid URL
+    result_no_batch_id = await db_data_creator.adb_client.get_next_url_agency_for_annotation(
+        user_id=1,
+        batch_id=None
+    )
+
+    assert result_no_batch_id.next_annotation.url == url_1.url
+
+
 @pytest.mark.asyncio
 async def test_get_next_url_for_user_relevance_annotation_validated(
     db_data_creator: DBDataCreator
@@ -462,16 +582,14 @@
     """
     A validated URL should not turn up in get_next_url_for_user_annotation
     """
 
-    batch_id = db_data_creator.batch()
-
-    # Create 1 URL with outcome `validated`
-    iui: InsertURLsInfo = 
db_data_creator.urls( - batch_id=batch_id, + setup_info = await setup_for_get_next_url_for_annotation( + db_data_creator=db_data_creator, url_count=1, outcome=URLStatus.VALIDATED ) - url_1 = iui.url_mappings[0] + + url_1 = setup_info.insert_urls_info.url_mappings[0] # Add `Relevancy` attribute with value `True` await db_data_creator.auto_relevant_suggestions( @@ -479,11 +597,9 @@ async def test_get_next_url_for_user_relevance_annotation_validated( relevant=True ) - # Add HTML data - await db_data_creator.html_data([url_1.url_id]) - adb_client = db_data_creator.adb_client url = await adb_client.get_next_url_for_relevance_annotation( - user_id=1 + user_id=1, + batch_id=None ) assert url is None \ No newline at end of file From eae4979c965f69de1414617fe5c78cf0c611c5b3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 2 Apr 2025 16:01:48 -0400 Subject: [PATCH 075/182] fix(tests): fix import bug --- .../integration/collector_db/test_db_client.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 12031afa..09a27a73 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -1,23 +1,18 @@ from datetime import datetime, timedelta import pytest -from _pytest.outcomes import fail from fastapi import HTTPException from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo -from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.LogInfo import LogInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLInfo import URLInfo -from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, \ - UserRelevantSuggestion +from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.enums import BatchStatus, RecordType, SuggestionType -from helpers.complex_test_data_functions import setup_for_get_next_url_for_annotation +from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_annotation from tests.helpers.DBDataCreator import DBDataCreator from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review From b669eaba1696d556872b1b460baee041b37187bd Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 2 Apr 2025 21:23:21 -0400 Subject: [PATCH 076/182] feat(app): add `review/reject-source` endpoint Additionally, modify `review/approve-source` to no longer accept `relevant` key-value pair. 
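
For reviewers, a rough sketch of the intended request flow for the new
endpoint. The base URL, ids, and auth header below are illustrative
placeholders, not values taken from this codebase:

    import requests

    # Reject URL 123, then receive the next source to review,
    # optionally scoped to batch 45 via the query parameter.
    response = requests.post(
        "http://localhost:8000/review/reject-source",
        params={"batch_id": 45},
        json={"url_id": 123},  # FinalReviewBaseInfo payload
        headers={"Authorization": "Bearer <token>"},  # placeholder credentials
    )
    next_source = response.json()["next_source"]  # None when nothing is left

Rejection sets the URL's outcome to `rejected` and records the acting user
in `reviewing_user_url`, mirroring the approval flow.
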
--- ...-4c70177eba78_add_rejected_batch_status.py | 47 +++++++++++++++++++ ...rename_approving_user_url_to_reviewing_.py | 25 ++++++++++ ...8fe75d_remove_relevant_column_from_urls.py | 27 +++++++++++ api/routes/review.py | 23 +++++++-- collector_db/AsyncDatabaseClient.py | 37 +++++++++++---- collector_db/models.py | 12 ++--- collector_manager/enums.py | 1 + core/AsyncCore.py | 17 +++++-- core/DTOs/FinalReviewApprovalInfo.py | 10 ++-- .../api/helpers/RequestValidator.py | 20 +++++--- .../integration/api/test_review.py | 30 ++++++++++-- .../collector_db/test_db_client.py | 5 +- 12 files changed, 211 insertions(+), 43 deletions(-) create mode 100644 alembic/versions/2025_04_02_2040-4c70177eba78_add_rejected_batch_status.py create mode 100644 alembic/versions/2025_04_02_2051-e3fe6d099583_rename_approving_user_url_to_reviewing_.py create mode 100644 alembic/versions/2025_04_02_2114-45271f8fe75d_remove_relevant_column_from_urls.py diff --git a/alembic/versions/2025_04_02_2040-4c70177eba78_add_rejected_batch_status.py b/alembic/versions/2025_04_02_2040-4c70177eba78_add_rejected_batch_status.py new file mode 100644 index 00000000..fcb9821b --- /dev/null +++ b/alembic/versions/2025_04_02_2040-4c70177eba78_add_rejected_batch_status.py @@ -0,0 +1,47 @@ +"""Add rejected batch status + +Revision ID: 4c70177eba78 +Revises: 5ea47dacd0ef +Create Date: 2025-04-02 20:40:54.982954 + +""" +from typing import Sequence, Union + + +from util.alembic_helpers import switch_enum_type + +# revision identifiers, used by Alembic. +revision: str = '4c70177eba78' +down_revision: Union[str, None] = '5ea47dacd0ef' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + switch_enum_type( + table_name='urls', + column_name='outcome', + enum_name='url_status', + new_enum_values=[ + 'pending', + 'submitted', + 'validated', + 'duplicate', + 'rejected', + 'error' + ] + ) + +def downgrade() -> None: + switch_enum_type( + table_name='urls', + column_name='outcome', + enum_name='url_status', + new_enum_values=[ + 'pending', + 'submitted', + 'validated', + 'duplicate', + 'error', + ] + ) diff --git a/alembic/versions/2025_04_02_2051-e3fe6d099583_rename_approving_user_url_to_reviewing_.py b/alembic/versions/2025_04_02_2051-e3fe6d099583_rename_approving_user_url_to_reviewing_.py new file mode 100644 index 00000000..c9c4eec1 --- /dev/null +++ b/alembic/versions/2025_04_02_2051-e3fe6d099583_rename_approving_user_url_to_reviewing_.py @@ -0,0 +1,25 @@ +"""Rename approving_user_url to reviewing_user_url + +Revision ID: e3fe6d099583 +Revises: 4c70177eba78 +Create Date: 2025-04-02 20:51:10.738159 + +""" +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. 
+revision: str = 'e3fe6d099583' +down_revision: Union[str, None] = '4c70177eba78' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.rename_table('approving_user_url', 'reviewing_user_url') + + +def downgrade() -> None: + op.rename_table('reviewing_user_url', 'approving_user_url') diff --git a/alembic/versions/2025_04_02_2114-45271f8fe75d_remove_relevant_column_from_urls.py b/alembic/versions/2025_04_02_2114-45271f8fe75d_remove_relevant_column_from_urls.py new file mode 100644 index 00000000..3f884391 --- /dev/null +++ b/alembic/versions/2025_04_02_2114-45271f8fe75d_remove_relevant_column_from_urls.py @@ -0,0 +1,27 @@ +"""Remove relevant column from urls + +Revision ID: 45271f8fe75d +Revises: e3fe6d099583 +Create Date: 2025-04-02 21:14:29.778488 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '45271f8fe75d' +down_revision: Union[str, None] = 'e3fe6d099583' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.drop_column('urls', 'relevant') + + + +def downgrade() -> None: + op.add_column('urls', sa.Column('relevant', sa.BOOLEAN(), nullable=True)) diff --git a/api/routes/review.py b/api/routes/review.py index 25ac85e8..649e0b39 100644 --- a/api/routes/review.py +++ b/api/routes/review.py @@ -4,7 +4,7 @@ from api.dependencies import get_async_core from core.AsyncCore import AsyncCore -from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \ GetNextURLForFinalReviewOuterResponse from security_manager.SecurityManager import AccessInfo, get_access_info @@ -37,9 +37,26 @@ async def approve_source( "If not specified, defaults to first qualifying URL", default=None), ) -> GetNextURLForFinalReviewOuterResponse: - next_source = await core.approve_and_get_next_source_for_review( + await core.approve_url( approval_info, access_info=access_info, - batch_id=batch_id ) + next_source = await core.get_next_source_for_review(batch_id=batch_id) + return GetNextURLForFinalReviewOuterResponse(next_source=next_source) + +@review_router.post("/reject-source") +async def reject_source( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), + review_info: FinalReviewBaseInfo = FinalReviewBaseInfo, + batch_id: Optional[int] = Query( + description="The batch id of the next URL to get. 
" + "If not specified, defaults to first qualifying URL", + default=None), +) -> GetNextURLForFinalReviewOuterResponse: + await core.reject_url( + url_id=review_info.url_id, + access_info=access_info, + ) + next_source = await core.get_next_source_for_review(batch_id=batch_id) return GetNextURLForFinalReviewOuterResponse(next_source=next_source) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 7ff5f8ad..e8105f55 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -23,7 +23,7 @@ from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ - UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency + UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency from collector_manager.enums import URLStatus, CollectorType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo @@ -1182,12 +1182,6 @@ def update_if_not_none( approval_info.record_type.value if approval_info.record_type is not None else None, required=True ) - update_if_not_none( - url, - "relevant", - approval_info.relevant, - required=True - ) # Get existing agency ids existing_agencies = url.confirmed_agencies or [] @@ -1263,10 +1257,35 @@ def update_if_not_none( ) # Add approving user - - approving_user_url = ApprovingUserURL( + approving_user_url = ReviewingUserURL( user_id=user_id, url_id=approval_info.url_id ) session.add(approving_user_url) + + @session_manager + async def reject_url( + self, + session: AsyncSession, + url_id: int, + user_id: int + ) -> None: + + query = ( + Select(URL) + .where(URL.id == url_id) + ) + + url = await session.execute(query) + url = url.scalars().first() + + url.outcome = URLStatus.REJECTED.value + + # Add rejecting user + rejecting_user_url = ReviewingUserURL( + user_id=user_id, + url_id=url_id + ) + + session.add(rejecting_user_url) \ No newline at end of file diff --git a/collector_db/models.py b/collector_db/models.py index 55b75af2..e420961f 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -95,6 +95,7 @@ class URL(Base): 'pending', 'submitted', 'validated', + 'rejected', 'duplicate', 'error', name='url_status' @@ -102,7 +103,6 @@ class URL(Base): nullable=False ) record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=True) - relevant = Column(Boolean, nullable=True) created_at = get_created_at_column() updated_at = get_updated_at_column() @@ -128,8 +128,8 @@ class URL(Base): "AutoRelevantSuggestion", uselist=False, back_populates="url") user_relevant_suggestions = relationship( "UserRelevantSuggestion", back_populates="url") - approving_users = relationship( - "ApprovingUserURL", back_populates="url") + reviewing_users = relationship( + "ReviewingUserURL", back_populates="url") optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( @@ -149,8 +149,8 @@ class URLOptionalDataSourceMetadata(Base): # Relationships url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") -class ApprovingUserURL(Base): - __tablename__ = 
'approving_user_url' +class ReviewingUserURL(Base): + __tablename__ = 'reviewing_user_url' __table_args__ = ( UniqueConstraint( "url_id", @@ -163,7 +163,7 @@ class ApprovingUserURL(Base): created_at = get_created_at_column() # Relationships - url = relationship("URL", back_populates="approving_users") + url = relationship("URL", back_populates="reviewing_users") class RootURL(Base): __tablename__ = 'root_url_cache' diff --git a/collector_manager/enums.py b/collector_manager/enums.py index e90ee7db..692b97e5 100644 --- a/collector_manager/enums.py +++ b/collector_manager/enums.py @@ -15,3 +15,4 @@ class URLStatus(Enum): VALIDATED = "validated" ERROR = "error" DUPLICATE = "duplicate" + REJECTED = "rejected" diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 43b81176..28a14fa2 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -251,17 +251,24 @@ async def get_next_source_for_review( batch_id=batch_id ) - async def approve_and_get_next_source_for_review( + async def approve_url( self, approval_info: FinalReviewApprovalInfo, - access_info: AccessInfo, - batch_id: Optional[int] + access_info: AccessInfo ): await self.adb_client.approve_url( approval_info=approval_info, user_id=access_info.user_id ) - return await self.get_next_source_for_review( - batch_id=batch_id + + + async def reject_url( + self, + url_id: int, + access_info: AccessInfo, + ): + await self.adb_client.reject_url( + url_id=url_id, + user_id=access_info.user_id ) diff --git a/core/DTOs/FinalReviewApprovalInfo.py b/core/DTOs/FinalReviewApprovalInfo.py index e24c3c75..d87fb628 100644 --- a/core/DTOs/FinalReviewApprovalInfo.py +++ b/core/DTOs/FinalReviewApprovalInfo.py @@ -4,21 +4,17 @@ from core.enums import RecordType - -class FinalReviewApprovalInfo(BaseModel): +class FinalReviewBaseInfo(BaseModel): url_id: int = Field( title="The id of the URL." ) + +class FinalReviewApprovalInfo(FinalReviewBaseInfo): record_type: Optional[RecordType] = Field( title="The final record type of the URL." "If none, defers to the existing value from the auto-labeler only if it exists.", default=None ) - relevant: Optional[bool] = Field( - title="Final determination on whether the URL is relevant." - "If none, defers to the existing value from the auto-labeler only if it exists.", - default=None - ) agency_ids: Optional[list[int]] = Field( title="The final confirmed agencies for the URL. 
" "If none, defers to an existing confirmed agency only if that exists.", diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index e2c8a479..02a51b29 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -9,22 +9,18 @@ from collector_db.enums import TaskType from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType -from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo -from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo, \ - GetNextRelevanceAnnotationResponseOuterInfo +from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo -from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse -from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \ - GetNextURLForFinalReviewOuterResponse +from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo -from core.DTOs.MessageCountResponse import MessageCountResponse from core.DTOs.MessageResponse import MessageResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo @@ -276,3 +272,13 @@ async def approve_and_get_next_source_for_review( json=approval_info.model_dump(mode='json') ) return GetNextURLForFinalReviewOuterResponse(**data) + + async def reject_and_get_next_source_for_review( + self, + review_info: FinalReviewBaseInfo + ) -> GetNextURLForFinalReviewOuterResponse: + data = self.post( + url=f"/review/reject-source", + json=review_info.model_dump(mode='json') + ) + return GetNextURLForFinalReviewOuterResponse(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index b4a94387..61b1ef7e 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -3,7 +3,7 @@ from collector_db.constants import PLACEHOLDER_AGENCY_NAME from collector_db.models import URL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Agency from collector_manager.enums import URLStatus -from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse from core.enums import RecordType from tests.helpers.complex_test_data_functions import 
setup_for_get_next_url_for_final_review
@@ -102,7 +102,6 @@ async def test_approve_and_get_next_source_for_review(api_test_helper):
         approval_info=FinalReviewApprovalInfo(
             url_id=url_mapping.url_id,
             record_type=RecordType.ARREST_RECORDS,
-            relevant=True,
             agency_ids=agency_ids,
             name="New Test Name",
             description="New Test Description",
@@ -121,7 +120,6 @@ async def test_approve_and_get_next_source_for_review(api_test_helper):
     url = urls[0]
     assert url.id == url_mapping.url_id
     assert url.record_type == RecordType.ARREST_RECORDS.value
-    assert url.relevant == True
     assert url.outcome == URLStatus.VALIDATED.value
     assert url.name == "New Test Name"
     assert url.description == "New Test Description"
@@ -144,4 +142,30 @@ async def test_approve_and_get_next_source_for_review(api_test_helper):
         if agency.agency_id == additional_agency:
             assert agency.name == PLACEHOLDER_AGENCY_NAME

+@pytest.mark.asyncio
+async def test_reject_and_get_next_source_for_review(api_test_helper):
+    ath = api_test_helper
+    db_data_creator = ath.db_data_creator
+    setup_info = await setup_for_get_next_url_for_final_review(
+        db_data_creator=db_data_creator,
+        annotation_count=3,
+        include_user_annotations=True
+    )
+    url_mapping = setup_info.url_mapping
+
+    result: GetNextURLForFinalReviewOuterResponse = await ath.request_validator.reject_and_get_next_source_for_review(
+        review_info=FinalReviewBaseInfo(
+            url_id=url_mapping.url_id,
+        )
+    )
+
+    assert result.next_source is None
+
+    adb_client = db_data_creator.adb_client
+    # Confirm the URL is now marked as rejected
+    urls = await adb_client.get_all(URL)
+    assert len(urls) == 1
+    url = urls[0]
+    assert url.id == url_mapping.url_id
+    assert url.outcome == URLStatus.REJECTED.value
\ No newline at end of file
diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py
index 09a27a73..67d27d09 100644
--- a/tests/test_automated/integration/collector_db/test_db_client.py
+++ b/tests/test_automated/integration/collector_db/test_db_client.py
@@ -8,7 +8,7 @@ from collector_db.DTOs.LogInfo import LogInfo
 from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
 from collector_db.DTOs.URLInfo import URLInfo
-from collector_db.models import URL, ApprovingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency
+from collector_db.models import URL, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency
 from collector_manager.enums import URLStatus
 from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
 from core.enums import BatchStatus, RecordType, SuggestionType
@@ -391,7 +391,6 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator):
     url = urls[0]
     assert url.id == url_mapping.url_id
     assert url.record_type == RecordType.ARREST_RECORDS.value
-    assert url.relevant == True
     assert url.outcome == URLStatus.VALIDATED.value
     assert url.name == "Test Name"
     assert url.description == "Test Description"
@@ -401,7 +400,7 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator):
     assert confirmed_agency[0].url_id == url_mapping.url_id
     assert confirmed_agency[0].agency_id == agency_id

-    approving_user_urls: list[ApprovingUserURL] = await adb_client.get_all(ApprovingUserURL)
+    approving_user_urls: list[ReviewingUserURL] = await adb_client.get_all(ReviewingUserURL)
     assert len(approving_user_urls) == 1
     assert approving_user_urls[0].user_id == 1
     assert approving_user_urls[0].url_id == url_mapping.url_id

From ea23d0c86f0611e6e7cea7ba200bb0423d494eae Mon Sep 17 00:00:00
2001 From: Max Chis Date: Fri, 4 Apr 2025 07:09:02 -0400 Subject: [PATCH 077/182] feat(database): Adjust annotation logic for URLs marked not relevant Now, URLs marked not relevant by a user should not show up for subsequent annotations such as record type or agency. --- collector_db/AsyncDatabaseClient.py | 33 +++++++++- .../collector_db/test_db_client.py | 61 ++++++++++++++++++- 2 files changed, 91 insertions(+), 3 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index e8105f55..7914e483 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -121,7 +121,8 @@ async def get_next_url_for_user_annotation( user_suggestion_model_to_exclude: UserSuggestionModel, auto_suggestion_relationship: QueryableAttribute, user_id: int, - batch_id: Optional[int] + batch_id: Optional[int], + check_if_annotated_not_relevant: bool = False ) -> URL: url_query = ( select( @@ -142,6 +143,21 @@ async def get_next_url_for_user_annotation( ) ) ) + + if check_if_annotated_not_relevant: + url_query = url_query.where( + not_( + exists( + select(UserRelevantSuggestion) + .where( + UserRelevantSuggestion.url_id == URL.id, + UserRelevantSuggestion.user_id == user_id, + UserRelevantSuggestion.relevant == False + ) + ) + ) + ) + if batch_id is not None: url_query = url_query.where(URL.batch_id == batch_id) @@ -233,7 +249,8 @@ async def get_next_url_for_record_type_annotation( user_suggestion_model_to_exclude=UserRecordTypeSuggestion, auto_suggestion_relationship=URL.auto_record_type_suggestion, user_id=user_id, - batch_id=batch_id + batch_id=batch_id, + check_if_annotated_not_relevant=True ) if url is None: return None @@ -832,6 +849,18 @@ async def get_next_url_agency_for_annotation( correlate(URL) ) ) + # Must not have been marked as "Not Relevant" by this user + .join(UserRelevantSuggestion, isouter=True) + .where( + ~exists( + select(UserRelevantSuggestion). 
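# Both the check_if_annotated_not_relevant filter above and the agency-annotation
# filter being assembled in this hunk are SQL anti-joins: NOT EXISTS over a
# subquery correlated with the outer URL row. A minimal sketch of the pattern in
# isolation, assuming the URL and UserRelevantSuggestion models from
# collector_db.models:
#
#     from sqlalchemy import select, not_, exists
#     from collector_db.models import URL, UserRelevantSuggestion
#
#     def urls_not_marked_irrelevant_by(user_id: int):
#         # Keep only URLs lacking a relevant=False suggestion from this user.
#         return select(URL).where(
#             not_(
#                 exists(
#                     select(UserRelevantSuggestion).where(
#                         UserRelevantSuggestion.url_id == URL.id,
#                         UserRelevantSuggestion.user_id == user_id,
#                         UserRelevantSuggestion.relevant == False,
#                     )
#                 )
#             )
#         )
#
# Note that the `.join(UserRelevantSuggestion, isouter=True)` in this hunk is not
# required for the NOT EXISTS exclusion to work, and may duplicate result rows
# when a URL has several suggestions; the anti-join alone suffices.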
+ where( + (UserRelevantSuggestion.user_id == user_id) & + (UserRelevantSuggestion.url_id == URL.id) & + (UserRelevantSuggestion.relevant == False) + ).correlate(URL) + ) + ) ).limit(1) raw_result = await session.execute(statement) results = raw_result.all() diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 67d27d09..7af9d5a2 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -8,6 +8,7 @@ from collector_db.DTOs.LogInfo import LogInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLInfo import URLInfo +from collector_db.DTOs.URLMapping import URLMapping from collector_db.models import URL, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo @@ -596,4 +597,62 @@ async def test_get_next_url_for_user_relevance_annotation_validated( user_id=1, batch_id=None ) - assert url is None \ No newline at end of file + assert url is None + +@pytest.mark.asyncio +async def test_annotate_url_marked_not_relevant(db_data_creator: DBDataCreator): + """ + If a URL is marked not relevant by the user, they should not receive that URL + in calls to get an annotation for record type or agency + Other users should still receive the URL + """ + setup_info = await setup_for_get_next_url_for_annotation( + db_data_creator=db_data_creator, + url_count=2 + ) + adb_client = db_data_creator.adb_client + url_to_mark_not_relevant: URLMapping = setup_info.insert_urls_info.url_mappings[0] + url_to_mark_relevant: URLMapping = setup_info.insert_urls_info.url_mappings[1] + for url_mapping in setup_info.insert_urls_info.url_mappings: + await db_data_creator.agency_auto_suggestions( + url_id=url_mapping.url_id, + count=3 + ) + await adb_client.add_user_relevant_suggestion( + user_id=1, + url_id=url_to_mark_not_relevant.url_id, + relevant=False + ) + await adb_client.add_user_relevant_suggestion( + user_id=1, + url_id=url_to_mark_relevant.url_id, + relevant=True + ) + + # User should not receive the URL for record type annotation + record_type_annotation_info = await adb_client.get_next_url_for_record_type_annotation( + user_id=1, + batch_id=None + ) + assert record_type_annotation_info.url_info.url_id != url_to_mark_not_relevant.url_id + + # Other users should still receive the URL for record type annotation + record_type_annotation_info = await adb_client.get_next_url_for_record_type_annotation( + user_id=2, + batch_id=None + ) + assert record_type_annotation_info.url_info.url_id == url_to_mark_not_relevant.url_id + + # User should not receive the URL for agency annotation + agency_annotation_info = await adb_client.get_next_url_agency_for_annotation( + user_id=1, + batch_id=None + ) + assert agency_annotation_info.next_annotation.url_id != url_to_mark_not_relevant.url_id + + # Other users should still receive the URL for agency annotation + agency_annotation_info = await adb_client.get_next_url_agency_for_annotation( + user_id=2, + batch_id=None + ) + assert agency_annotation_info.next_annotation.url_id == url_to_mark_not_relevant.url_id From c20e8ac434b00a386354647f7d98a19f1b0acc5f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 4 Apr 2025 07:45:26 -0400 Subject: [PATCH 078/182] feat(database): add agency not in database in annotate agencies Previously, the 
`/annotate/agencies` `POST` method would return a 500 error when an agency whose ID was not yet in the DBI database was submitted. This has been resolved. --- collector_db/AsyncDatabaseClient.py | 12 +++++ tests/helpers/complex_test_data_functions.py | 23 +++++++- .../integration/api/test_annotate.py | 53 +++++++------------ .../collector_db/test_db_client.py | 24 ++++++++- 4 files changed, 75 insertions(+), 37 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 7914e483..50fbc586 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -976,6 +976,18 @@ async def add_agency_manual_suggestion( ): if is_new and agency_id is not None: raise ValueError("agency_id must be None when is_new is True") + + # Check if agency exists in database -- if not, add with placeholder + if agency_id is not None: + statement = select(Agency).where(Agency.agency_id == agency_id) + result = await session.execute(statement) + if len(result.all()) == 0: + agency = Agency( + agency_id=agency_id, + name=PLACEHOLDER_AGENCY_NAME + ) + await session.merge(agency) + url_agency_suggestion = UserUrlAgencySuggestion( url_id=url_id, agency_id=agency_id, diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 104402c0..febf4e35 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -3,7 +3,8 @@ from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.URLMapping import URLMapping from collector_manager.enums import URLStatus -from core.enums import RecordType +from core.enums import RecordType, SuggestionType +from helpers.DBDataCreator import BatchURLCreationInfo from tests.helpers.DBDataCreator import DBDataCreator class AnnotationSetupInfo(BaseModel): @@ -28,6 +29,26 @@ async def setup_for_get_next_url_for_annotation( ) return AnnotationSetupInfo(batch_id=batch_id, insert_urls_info=insert_urls_info) +class AnnotateAgencySetupInfo(BaseModel): + batch_id: int + url_ids: list[int] + +async def setup_for_annotate_agency( + db_data_creator: DBDataCreator, + url_count: int, + suggestion_type: SuggestionType = SuggestionType.UNKNOWN +): + buci: BatchURLCreationInfo = await db_data_creator.batch_and_urls( + url_count=url_count, + with_html_content=True + ) + await db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=suggestion_type + ) + + return AnnotateAgencySetupInfo(batch_id=buci.batch_id, url_ids=buci.url_ids) class FinalReviewSetupInfo(BaseModel): batch_id: int diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 1530dcb1..0bf4be11 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -13,6 +13,7 @@ from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import RecordType, SuggestionType +from helpers.complex_test_data_functions import AnnotateAgencySetupInfo, setup_for_annotate_agency from html_tag_collector.DataClassTags import ResponseHTMLInfo from tests.helpers.DBDataCreator import BatchURLCreationInfo from tests.test_automated.integration.api.conftest import MOCK_USER_ID @@ -221,7 +222,6 @@ async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): The user should 
receive all of the auto suggestions with full detail """ ath = api_test_helper - adb_client = ath.adb_client() buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( url_count=1, with_html_content=True @@ -264,7 +264,6 @@ async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): The user should receive a single Unknown Auto Suggestion lacking other detail """ ath = api_test_helper - adb_client = ath.adb_client() buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( url_count=1, with_html_content=True @@ -306,7 +305,6 @@ async def test_annotate_agency_single_confirmed_agency(api_test_helper): The user should not receive this URL to annotate """ ath = api_test_helper - adb_client = ath.adb_client() buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( url_count=1, with_html_content=True @@ -325,20 +323,16 @@ async def test_annotate_agency_other_user_annotation(api_test_helper): Our user should still receive this URL to annotate """ ath = api_test_helper - adb_client = ath.adb_client() - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=1, - suggestion_type=SuggestionType.UNKNOWN + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 ) + url_ids = setup_info.url_ids + await ath.db_data_creator.manual_suggestion( user_id=MOCK_USER_ID + 1, - url_id=buci.url_ids[0], + url_id=url_ids[0], ) response = await ath.request_validator.get_next_agency_annotation() @@ -346,7 +340,7 @@ async def test_annotate_agency_other_user_annotation(api_test_helper): assert response.next_annotation next_annotation = response.next_annotation # Check that url_id matches the one we inserted - assert next_annotation.url_id == buci.url_ids[0] + assert next_annotation.url_id == url_ids[0] # Check that html data is present assert next_annotation.html_info.description != "" @@ -364,20 +358,15 @@ async def test_annotate_agency_submit_and_get_next(api_test_helper): Until another relevant URL is added """ ath = api_test_helper - adb_client = ath.adb_client() - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=2, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=1, - suggestion_type=SuggestionType.UNKNOWN + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=2 ) + url_ids = setup_info.url_ids # User should submit an annotation and receive the next response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=buci.url_ids[0], + url_id=url_ids[0], agency_annotation_post_info=URLAgencyAnnotationPostInfo( suggested_agency=await ath.db_data_creator.agency(), is_new=False @@ -388,7 +377,7 @@ async def test_annotate_agency_submit_and_get_next(api_test_helper): # User should submit this annotation and receive none for the next response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=buci.url_ids[1], + url_id=url_ids[1], agency_annotation_post_info=URLAgencyAnnotationPostInfo( suggested_agency=await ath.db_data_creator.agency(), is_new=False @@ -407,19 +396,15 @@ async def test_annotate_agency_submit_new(api_test_helper): """ ath = api_test_helper adb_client = ath.adb_client() - buci: BatchURLCreationInfo = await 
ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=1, - suggestion_type=SuggestionType.UNKNOWN + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 ) + url_ids = setup_info.url_ids # User should submit an annotation and mark it as New response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=buci.url_ids[0], + url_id=url_ids[0], agency_annotation_post_info=URLAgencyAnnotationPostInfo( suggested_agency=await ath.db_data_creator.agency(), is_new=True diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 7af9d5a2..6090aaf1 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -9,11 +9,12 @@ from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMapping import URLMapping -from collector_db.models import URL, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency +from collector_db.constants import PLACEHOLDER_AGENCY_NAME +from collector_db.models import URL, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Agency from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.enums import BatchStatus, RecordType, SuggestionType -from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_annotation +from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_annotation, setup_for_annotate_agency from tests.helpers.DBDataCreator import DBDataCreator from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review @@ -656,3 +657,22 @@ async def test_annotate_url_marked_not_relevant(db_data_creator: DBDataCreator): batch_id=None ) assert agency_annotation_info.next_annotation.url_id == url_to_mark_not_relevant.url_id + +@pytest.mark.asyncio +async def test_annotate_url_agency_agency_not_in_db(db_data_creator: DBDataCreator): + setup_info = await setup_for_annotate_agency( + db_data_creator, + url_count=1 + ) + + url_id = setup_info.url_ids[0] + await db_data_creator.adb_client.add_agency_manual_suggestion( + agency_id=1, + url_id=url_id, + user_id=1, + is_new=False + ) + + agencies = await db_data_creator.adb_client.get_all(Agency) + assert len(agencies) + assert agencies[0].name == PLACEHOLDER_AGENCY_NAME \ No newline at end of file From def484448b74b7f9e6ec4e6cc27dc12c99634192 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 4 Apr 2025 07:54:37 -0400 Subject: [PATCH 079/182] fix(tests): fix import bug --- tests/helpers/complex_test_data_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index febf4e35..57fd6b96 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -4,7 +4,7 @@ from collector_db.DTOs.URLMapping import URLMapping from collector_manager.enums import URLStatus from core.enums import RecordType, SuggestionType -from helpers.DBDataCreator import BatchURLCreationInfo +from tests.helpers.DBDataCreator import BatchURLCreationInfo from tests.helpers.DBDataCreator 
import DBDataCreator class AnnotationSetupInfo(BaseModel): From fcb9b2df57a2c33959d63dabb8839ffd823430f3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 4 Apr 2025 08:00:46 -0400 Subject: [PATCH 080/182] fix(tests): fix import bug --- tests/test_automated/integration/api/test_annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 0bf4be11..3d870371 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -13,7 +13,7 @@ from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import RecordType, SuggestionType -from helpers.complex_test_data_functions import AnnotateAgencySetupInfo, setup_for_annotate_agency +from tests.helpers.complex_test_data_functions import AnnotateAgencySetupInfo, setup_for_annotate_agency from html_tag_collector.DataClassTags import ResponseHTMLInfo from tests.helpers.DBDataCreator import BatchURLCreationInfo from tests.test_automated.integration.api.conftest import MOCK_USER_ID From 443e76795d462113e095d9db56085313d2575275 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 4 Apr 2025 08:39:37 -0400 Subject: [PATCH 081/182] feat(api): require final review permission for review endpoints BREAKING CHANGE: All `/review/`endpoints now require the `source_collector_final_review` permission --- api/routes/review.py | 10 ++++++---- security_manager/SecurityManager.py | 16 +++++++++++++--- tests/test_automated/integration/api/conftest.py | 12 ++++++++++-- .../security_manager/test_security_manager.py | 4 ++-- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/api/routes/review.py b/api/routes/review.py index 649e0b39..62bf5de6 100644 --- a/api/routes/review.py +++ b/api/routes/review.py @@ -7,7 +7,7 @@ from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \ GetNextURLForFinalReviewOuterResponse -from security_manager.SecurityManager import AccessInfo, get_access_info +from security_manager.SecurityManager import AccessInfo, get_access_info, require_permission, Permissions review_router = APIRouter( prefix="/review", @@ -15,10 +15,12 @@ responses={404: {"description": "Not found"}}, ) +requires_final_review_permission = require_permission(Permissions.SOURCE_COLLECTOR_FINAL_REVIEW) + @review_router.get("/next-source") async def get_next_source( core: AsyncCore = Depends(get_async_core), - access_info: AccessInfo = Depends(get_access_info), + access_info: AccessInfo = Depends(requires_final_review_permission), batch_id: Optional[int] = Query( description="The batch id of the next URL to get. " "If not specified, defaults to first qualifying URL", @@ -30,7 +32,7 @@ async def get_next_source( @review_router.post("/approve-source") async def approve_source( core: AsyncCore = Depends(get_async_core), - access_info: AccessInfo = Depends(get_access_info), + access_info: AccessInfo = Depends(requires_final_review_permission), approval_info: FinalReviewApprovalInfo = FinalReviewApprovalInfo, batch_id: Optional[int] = Query( description="The batch id of the next URL to get. 
" @@ -47,7 +49,7 @@ async def approve_source( @review_router.post("/reject-source") async def reject_source( core: AsyncCore = Depends(get_async_core), - access_info: AccessInfo = Depends(get_access_info), + access_info: AccessInfo = Depends(requires_final_review_permission), review_info: FinalReviewBaseInfo = FinalReviewBaseInfo, batch_id: Optional[int] = Query( description="The batch id of the next URL to get. " diff --git a/security_manager/SecurityManager.py b/security_manager/SecurityManager.py index 8d80f46c..92da2975 100644 --- a/security_manager/SecurityManager.py +++ b/security_manager/SecurityManager.py @@ -20,6 +20,7 @@ def get_secret_key(): class Permissions(Enum): SOURCE_COLLECTOR = "source_collector" + SOURCE_COLLECTOR_FINAL_REVIEW = "source_collector_final_review" class AccessInfo(BaseModel): user_id: int @@ -65,9 +66,13 @@ def get_relevant_permissions(raw_permissions: list[str]) -> list[Permissions]: continue return relevant_permissions - def check_access(self, token: str) -> AccessInfo: + def check_access( + self, + token: str, + permission: Permissions + ) -> AccessInfo: access_info = self.validate_token(token) - if not access_info.has_permission(Permissions.SOURCE_COLLECTOR): + if not access_info.has_permission(permission): raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail="Access forbidden", @@ -80,4 +85,9 @@ def check_access(self, token: str) -> AccessInfo: def get_access_info( token: Annotated[str, Depends(oauth2_scheme)] ) -> AccessInfo: - return SecurityManager().check_access(token) \ No newline at end of file + return SecurityManager().check_access(token, Permissions.SOURCE_COLLECTOR) + +def require_permission(permission: Permissions): + def dependency(token: Annotated[str, Depends(oauth2_scheme)]) -> AccessInfo: + return SecurityManager().check_access(token, permission=permission) + return dependency \ No newline at end of file diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index d9a504a7..a0a46abf 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -6,8 +6,9 @@ from starlette.testclient import TestClient from api.main import app +from api.routes.review import requires_final_review_permission from core.SourceCollectorCore import SourceCollectorCore -from security_manager.SecurityManager import get_access_info, AccessInfo, Permissions +from security_manager.SecurityManager import get_access_info, AccessInfo, Permissions, require_permission from tests.helpers.DBDataCreator import DBDataCreator from tests.test_automated.integration.api.helpers.RequestValidator import RequestValidator @@ -27,12 +28,19 @@ def adb_client(self): def override_access_info() -> AccessInfo: - return AccessInfo(user_id=MOCK_USER_ID, permissions=[Permissions.SOURCE_COLLECTOR]) + return AccessInfo( + user_id=MOCK_USER_ID, + permissions=[ + Permissions.SOURCE_COLLECTOR, + Permissions.SOURCE_COLLECTOR_FINAL_REVIEW + ] + ) @pytest.fixture def client(db_client_test) -> Generator[TestClient, None, None]: with TestClient(app) as c: app.dependency_overrides[get_access_info] = override_access_info + app.dependency_overrides[requires_final_review_permission] = override_access_info core: SourceCollectorCore = c.app.state.core # core.shutdown() yield c diff --git a/tests/test_automated/unit/security_manager/test_security_manager.py b/tests/test_automated/unit/security_manager/test_security_manager.py index f827cc1b..fd03fee5 100644 --- 
a/tests/test_automated/unit/security_manager/test_security_manager.py +++ b/tests/test_automated/unit/security_manager/test_security_manager.py @@ -49,7 +49,7 @@ def test_validate_token_failure(mock_get_secret_key, mock_jwt_decode): def test_check_access_success(mock_get_secret_key, mock_jwt_decode): sm = SecurityManager() - sm.check_access(VALID_TOKEN) # Should not raise any exceptions. + sm.check_access(VALID_TOKEN, Permissions.SOURCE_COLLECTOR) # Should not raise any exceptions. def test_check_access_failure(mock_get_secret_key, mock_jwt_decode): @@ -57,7 +57,7 @@ def test_check_access_failure(mock_get_secret_key, mock_jwt_decode): with patch(get_patch_path("SecurityManager.validate_token"), return_value=AccessInfo(user_id=1, permissions=[])): sm = SecurityManager() with pytest.raises(HTTPException) as exc_info: - sm.check_access(VALID_TOKEN) + sm.check_access(VALID_TOKEN, Permissions.SOURCE_COLLECTOR) assert exc_info.value.status_code == 403 From 3b3253fd39edcfaf7b600c486626b2a371fdef13 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 5 Apr 2025 08:52:36 -0400 Subject: [PATCH 082/182] feat(app): update misc metadata task to use html title description as defaults --- collector_db/AsyncDatabaseClient.py | 2 +- collector_db/StatementComposer.py | 11 +---------- core/classes/URLMiscellaneousMetadataTaskOperator.py | 12 +++++++++--- .../tasks/test_url_miscellaneous_metadata_task.py | 4 +++- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 172f8061..34ebe7f7 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -410,7 +410,7 @@ async def get_pending_urls_missing_miscellaneous_metadata( for result in all_results: tdo = URLMiscellaneousMetadataTDO( url_id=result.id, - collector_metadata=result.collector_metadata, + collector_metadata=result.collector_metadata or {}, collector_type=CollectorType(result.batch.strategy), ) html_info = URLHTMLMetadataInfo() diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index c80b83e5..b2b7e706 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -73,16 +73,7 @@ def pending_urls_missing_miscellaneous_metadata_query() -> Select: URL.outcome == URLStatus.PENDING.value, URL.name == None, URL.description == None, - URLOptionalDataSourceMetadata.url_id == None, - Batch.strategy.in_( - [ - CollectorType.AUTO_GOOGLER.value, - CollectorType.CKAN.value, - CollectorType.MUCKROCK_ALL_SEARCH.value, - CollectorType.MUCKROCK_COUNTY_SEARCH.value, - CollectorType.MUCKROCK_SIMPLE_SEARCH.value - ] - ) + URLOptionalDataSourceMetadata.url_id == None ) ).outerjoin( URLOptionalDataSourceMetadata diff --git a/core/classes/URLMiscellaneousMetadataTaskOperator.py b/core/classes/URLMiscellaneousMetadataTaskOperator.py index 4b9becdb..1cbebbc6 100644 --- a/core/classes/URLMiscellaneousMetadataTaskOperator.py +++ b/core/classes/URLMiscellaneousMetadataTaskOperator.py @@ -1,3 +1,5 @@ +from typing import Optional + from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType @@ -26,7 +28,10 @@ def task_type(self): async def meets_task_prerequisites(self): return await self.adb_client.has_pending_urls_missing_miscellaneous_metadata() - async def get_subtask(self, collector_type: CollectorType) -> MiscellaneousMetadataSubtaskBase: + async def get_subtask( + self, + collector_type: 
CollectorType + ) -> Optional[MiscellaneousMetadataSubtaskBase]: match collector_type: case CollectorType.MUCKROCK_SIMPLE_SEARCH: return MuckrockMiscMetadataSubtask() @@ -39,7 +44,7 @@ async def get_subtask(self, collector_type: CollectorType) -> MiscellaneousMetad case CollectorType.CKAN: return CKANMiscMetadataSubtask() case _: - raise Exception(f"Unknown collector type: {collector_type}") + return None async def html_default_logic(self, tdo: URLMiscellaneousMetadataTDO): if tdo.name is None: @@ -55,7 +60,8 @@ async def inner_task_logic(self): for tdo in tdos: subtask = await self.get_subtask(tdo.collector_type) try: - subtask.process(tdo) + if subtask is not None: + subtask.process(tdo) await self.html_default_logic(tdo) except Exception as e: error_info = URLErrorPydanticInfo( diff --git a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py index 51f57da9..818d5aef 100644 --- a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py +++ b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py @@ -84,6 +84,8 @@ async def test_url_miscellaneous_metadata_task(db_data_creator: DBDataCreator): CollectorType.COMMON_CRAWLER, collector_metadata=None ) + # Add URL HTML + await db_data_creator.html_data([common_crawler_url_id]) # example # Check that task now meets prerequisites @@ -96,7 +98,7 @@ async def test_url_miscellaneous_metadata_task(db_data_creator: DBDataCreator): # Check that each URL has the expected name/description and optional metadata expected_urls = { - common_crawler_url_id: (None, None), + common_crawler_url_id: ("test html content", "test description"), auto_googler_url_id: ("Test Auto Googler Title", "Test Auto Googler Snippet"), ckan_url_id: ("Test CKAN Name", "Test CKAN Description"), muckrock_simple_url_id: ("Test Muckrock Simple Title", "Test Muckrock Simple Title"), From 77c7dff9ed972b01ca679602f2f84ca97e2371c9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 8 Apr 2025 20:34:53 -0400 Subject: [PATCH 083/182] DRAFT --- ..._add_data_source_id_column_to_url_table.py | 10 +- collector_db/AsyncDatabaseClient.py | 64 ++++++- .../task_data_objects/SubmitApprovedURLTDO.py | 3 +- core/classes/SubmitApprovedURLTaskOperator.py | 32 +++- pdap_api_client/DTOs.py | 1 + pdap_api_client/PDAPClient.py | 30 ++- tests/helpers/DBDataCreator.py | 5 +- .../tasks/test_submit_approved_url_task.py | 171 ++++++++++++++++++ 8 files changed, 302 insertions(+), 14 deletions(-) create mode 100644 tests/test_automated/integration/tasks/test_submit_approved_url_task.py diff --git a/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py b/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py index 8e15dbf2..b92fe1ef 100644 --- a/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py +++ b/alembic/versions/2025_03_29_1716-33a546c93441_add_data_source_id_column_to_url_table.py @@ -1,7 +1,7 @@ """Add data source ID column to URL table Revision ID: 33a546c93441 -Revises: 5ea47dacd0ef +Revises: 45271f8fe75d Create Date: 2025-03-29 17:16:11.863064 """ @@ -13,19 +13,19 @@ # revision identifiers, used by Alembic. 
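# The get_subtask() change earlier in this patch turns an unknown collector type
# from a raised exception into a soft skip (return None), so processing continues
# for collectors that have no metadata subtask. A standalone sketch of that
# dispatch pattern, with hypothetical stand-in names rather than the repo's
# actual subtask classes:
from enum import Enum
from typing import Callable, Optional


class Collector(Enum):
    AUTO_GOOGLER = "auto_googler"
    CKAN = "ckan"
    COMMON_CRAWLER = "common_crawler"


def subtask_for(collector: Collector) -> Optional[Callable[[dict], None]]:
    # match/case maps known collectors to handlers; the wildcard returns None
    # instead of raising, and callers guard with `if subtask is not None`.
    match collector:
        case Collector.AUTO_GOOGLER:
            return lambda tdo: tdo.update(name=tdo.get("title"))
        case Collector.CKAN:
            return lambda tdo: tdo.update(name=tdo.get("ckan_name"))
        case _:
            return None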
revision: str = '33a546c93441' -down_revision: Union[str, None] = '5ea47dacd0ef' +down_revision: Union[str, None] = '45271f8fe75d' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: op.add_column( - 'url', + 'urls', sa.Column('data_source_id', sa.Integer(), nullable=True) ) # Add unique constraint to data_source_id column - op.create_unique_constraint('uq_data_source_id', 'url', ['data_source_id']) + op.create_unique_constraint('uq_data_source_id', 'urls', ['data_source_id']) def downgrade() -> None: - op.drop_column('url', 'data_source_id') + op.drop_column('urls', 'data_source_id') diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 34ebe7f7..e74a28ec 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -15,7 +15,6 @@ from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMapping import URLMapping -from collector_db.DTOs.URLWithHTML import URLWithHTML from collector_db.StatementComposer import StatementComposer from collector_db.constants import PLACEHOLDER_AGENCY_NAME from collector_db.enums import URLMetadataAttributeType, TaskType @@ -37,6 +36,7 @@ GetURLsResponseInnerInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from core.enums import BatchStatus, SuggestionType, RecordType from html_tag_collector.DataClassTags import convert_to_response_html_info @@ -1337,4 +1337,64 @@ async def reject_url( url_id=url_id ) - session.add(rejecting_user_url) \ No newline at end of file + session.add(rejecting_user_url) + + @session_manager + async def has_validated_urls(self, session: AsyncSession) -> bool: + query = ( + select(URL) + .where(URL.outcome == URLStatus.VALIDATED.value) + ) + urls = await session.execute(query) + urls = urls.scalars().all() + return len(urls) > 0 + + @session_manager + async def get_validated_urls( + self, + session: AsyncSession + ) -> list[SubmitApprovedURLTDO]: + query = ( + select(URL) + .where(URL.outcome == URLStatus.VALIDATED.value) + .options( + selectinload(URL.optional_data_source_metadata), + selectinload(URL.confirmed_agencies) + ) + ) + urls = await session.execute(query) + urls = urls.scalars().all() + results: list[SubmitApprovedURLTDO] = [] + for url in urls: + agency_ids = [] + for agency in url.confirmed_agencies: + agency_ids.append(agency.agency_id) + tdo = SubmitApprovedURLTDO( + url_id=url.id, + url=url.url, + name=url.name, + agency_ids=agency_ids, + description=url.description, + record_type=url.record_type, + record_formats=url.optional_data_source_metadata.record_formats, + data_portal_type=url.optional_data_source_metadata.data_portal_type, + supplying_entity=url.optional_data_source_metadata.supplying_entity, + ) + results.append(tdo) + return results + + @session_manager + async def mark_urls_as_submitted(self, session: AsyncSession, tdos: list[SubmitApprovedURLTDO]): + for tdo in tdos: + url_id = tdo.url_id + data_source_id = tdo.data_source_id + query = ( + update(URL) + .where(URL.id == url_id) + .values( + data_source_id=data_source_id, + outcome=URLStatus.SUBMITTED.value 
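# The loop in mark_urls_as_submitted (this hunk) issues one UPDATE per TDO. For
# large validated batches, SQLAlchemy 2.0's ORM bulk-UPDATE-by-primary-key form
# can do the same work in a single executemany call -- a sketch under the same
# models, not the committed implementation:
#
#     await session.execute(
#         update(URL),
#         [
#             {
#                 "id": tdo.url_id,
#                 "data_source_id": tdo.data_source_id,
#                 "outcome": URLStatus.SUBMITTED.value,
#             }
#             for tdo in tdos
#         ],
#     )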
+ ) + ) + await session.execute(query) + diff --git a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py index fc6e789b..45fa7daf 100644 --- a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py +++ b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py @@ -6,9 +6,10 @@ class SubmitApprovedURLTDO(BaseModel): + url_id: int url: str record_type: RecordType - agency_id: Optional[int] + agency_ids: list[int] name: str description: str record_formats: Optional[list[str]] = None diff --git a/core/classes/SubmitApprovedURLTaskOperator.py b/core/classes/SubmitApprovedURLTaskOperator.py index 06b28a18..2a308e7c 100644 --- a/core/classes/SubmitApprovedURLTaskOperator.py +++ b/core/classes/SubmitApprovedURLTaskOperator.py @@ -1,6 +1,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType -from core.DTOs.task_data_objects.UrlHtmlTDO import UrlHtmlTDO +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO from core.classes.TaskOperatorBase import TaskOperatorBase from pdap_api_client.PDAPClient import PDAPClient @@ -23,7 +24,30 @@ async def meets_task_prerequisites(self): return await self.adb_client.has_validated_urls() async def inner_task_logic(self): - raise NotImplementedError + # Retrieve all URLs that are validated and not submitted + tdos: list[SubmitApprovedURLTDO] = await self.adb_client.get_validated_urls() - async def update_errors_in_database(self, error_tdos: list[UrlHtmlTDO]): - raise NotImplementedError \ No newline at end of file + # Link URLs to this task + await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) + + # Submit each URL, recording errors if they exist + error_infos: list[URLErrorPydanticInfo] = [] + success_tdos: list[SubmitApprovedURLTDO] = [] + for tdo in tdos: + try: + data_source_id = await self.pdap_client.submit_url(tdo) + tdo.data_source_id = data_source_id + success_tdos.append(tdo) + except Exception as e: + error_info = URLErrorPydanticInfo( + task_id=self.task_id, + url_id=tdo.url_id, + error=str(e), + ) + error_infos.append(error_info) + + # Update the database for successful submissions + await self.adb_client.mark_urls_as_submitted(tdos=success_tdos) + + # Update the database for failed submissions + await self.adb_client.add_url_error_infos(error_infos) diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py index 19255a35..37d7e857 100644 --- a/pdap_api_client/DTOs.py +++ b/pdap_api_client/DTOs.py @@ -36,6 +36,7 @@ class Namespaces(Enum): AUTH = "auth" MATCH = "match" CHECK = "check" + DATA_SOURCES = "data-sources" class RequestType(Enum): diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index b2b89564..8b1c5e82 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,5 +1,6 @@ from typing import Optional +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO from pdap_api_client.AccessManager import build_url, AccessManager from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ RequestType, RequestInfo, MatchAgencyResponse @@ -21,7 +22,6 @@ async def match_agency( county: Optional[str] = None, locality: Optional[str] = None ) -> MatchAgencyResponse: - # TODO: Change to async """ Returns agencies, if any, that match or partially match the search criteria """ @@ -84,3 +84,31 @@ async def is_url_unique( 
is_unique=is_unique, duplicates=duplicates ) + + async def submit_url( + self, + tdo: SubmitApprovedURLTDO + ) -> int: + url = build_url( + namespace=Namespaces.DATA_SOURCES, + ) + headers = await self.access_manager.jwt_header() + request_info = RequestInfo( + type_=RequestType.POST, + url=url, + headers=headers, + json={ + "entry_data": { + "name": tdo.name, + "description": tdo.description, + "source_url": tdo.url, + "record_type_name": tdo.record_type.value, + "record_formats": tdo.record_formats, + "data_portal_type": tdo.data_portal_type, + "supplying_entity": tdo.supplying_entity + }, + "linked_agency_ids": tdo.agency_ids + } + ) + response_info = await self.access_manager.make_request(request_info) + return response_info.data["id"] diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 9f9719a7..dbf7072a 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -61,7 +61,10 @@ async def batch_and_urls( if with_html_content: await self.html_data(url_ids) - return BatchURLCreationInfo(batch_id=batch_id, url_ids=url_ids) + return BatchURLCreationInfo( + batch_id=batch_id, + url_ids=url_ids + ) async def agency(self) -> int: agency_id = randint(1, 99999999) diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py new file mode 100644 index 00000000..75630af8 --- /dev/null +++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py @@ -0,0 +1,171 @@ +from http import HTTPStatus +from unittest.mock import MagicMock, AsyncMock + +import pytest + +from collector_db.models import URL +from collector_manager.enums import URLStatus +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome +from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator +from core.enums import RecordType +from helpers.DBDataCreator import BatchURLCreationInfo, DBDataCreator +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.DTOs import RequestInfo, RequestType, ResponseInfo +from pdap_api_client.PDAPClient import PDAPClient + + +@pytest.fixture +def mock_pdap_client(): + mock_access_manager = MagicMock( + spec=AccessManager + ) + mock_access_manager.make_request = AsyncMock( + side_effect=[ + ResponseInfo( + status_code=HTTPStatus.OK, + data={ + "id": 21 + } + ), + ResponseInfo( + status_code=HTTPStatus.OK, + data={ + "id": 34 + } + ) + ] + ) + mock_access_manager.jwt_header = AsyncMock( + return_value={"Authorization": "Bearer token"} + ) + pdap_client = PDAPClient( + access_manager=mock_access_manager + ) + return pdap_client + +async def setup_validated_urls(db_data_creator: DBDataCreator): + creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls( + url_count=2, + with_html_content=True + ) + url_1 = creation_info.url_ids[0] + url_2 = creation_info.url_ids[1] + await db_data_creator.adb_client.approve_url( + approval_info=FinalReviewApprovalInfo( + url_id=url_1, + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=[1, 2], + name="URL 1 Name", + description="URL 1 Description", + record_formats=["Record Format 1", "Record Format 2"], + data_portal_type="Data Portal Type 1", + supplying_entity="Supplying Entity 1" + ), + user_id=1 + ) + await db_data_creator.adb_client.approve_url( + approval_info=FinalReviewApprovalInfo( + url_id=url_2, + record_type=RecordType.INCARCERATION_RECORDS, + 
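# The mock_pdap_client fixture above leans on AsyncMock's side_effect accepting
# an iterable: each awaited call consumes the next item, which is why the first
# submitted URL is assigned data source id 21 and the second id 34. The same
# behavior in isolation (hypothetical values):
#
#     from unittest.mock import AsyncMock
#
#     mock = AsyncMock(side_effect=["first", "second"])
#     assert await mock() == "first"
#     assert await mock() == "second"  # a third call exhausts the iterator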
agency_ids=[3, 4], + name="URL 2 Name", + description="URL 2 Description", + ), + user_id=1 + ) + +@pytest.mark.asyncio +async def test_submit_approved_url_task( + db_data_creator, + mock_pdap_client, + monkeypatch +): + monkeypatch.setenv("PDAP_API_URL", "http://localhost:8000") + + # Get Task Operator + operator = SubmitApprovedURLTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=mock_pdap_client + ) + + # Check Task Operator does not yet meet pre-requisites + assert not await operator.meets_task_prerequisites() + + # Create URLs with status 'validated' in database and all requisite URL values + # Ensure they have optional metadata as well + await setup_validated_urls(db_data_creator) + + # Check Task Operator does meet pre-requisites + assert await operator.meets_task_prerequisites() + + # Run Task + run_info = await operator.run_task(task_id=1) + + # Check Task has been marked as completed + assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message + + # Get URLs + urls = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") + url_1 = urls[0] + url_2 = urls[1] + + # Check URLs have been marked as 'submitted' + assert url_1.outcome == URLStatus.SUBMITTED.value + assert url_2.outcome == URLStatus.SUBMITTED.value + + # Check URLs now have data source ids + assert url_1.data_source_id == 21 + assert url_2.data_source_id == 34 + + # Check mock method was called twice with expected parameters + access_manager = mock_pdap_client.access_manager + assert access_manager.make_request.call_count == 2 + # Check first call + + + call_1 = access_manager.make_request.call_args_list[0][0][0] + expected_call_1 = RequestInfo( + type_=RequestType.POST, + url="http://localhost:8000/data-sources", + headers=access_manager.jwt_header.return_value, + json={ + "entry_data": { + "name": "URL 1 Name", + "source_url": url_1.url, + "record_type_name": "Accident Reports", + "description": "URL 1 Description", + "record_formats": ["Record Format 1", "Record Format 2"], + "data_portal_type": "Data Portal Type 1", + "supplying_entity": "Supplying Entity 1" + }, + "linked_agency_ids": [1, 2] + } + ) + assert call_1.type_ == expected_call_1.type_ + assert call_1.url == expected_call_1.url + assert call_1.headers == expected_call_1.headers + assert call_1.json == expected_call_1.json + # Check second call + call_2 = access_manager.make_request.call_args_list[1][0][0] + expected_call_2 = RequestInfo( + type_=RequestType.POST, + url="http://localhost:8000/data-sources", + headers=access_manager.jwt_header.return_value, + json={ + "entry_data": { + "name": "URL 2 Name", + "source_url": url_2.url, + "record_type_name": "Incarceration Records", + "description": "URL 2 Description", + "data_portal_type": None, + "supplying_entity": None, + "record_formats": None + }, + "linked_agency_ids": [3, 4] + } + ) + assert call_2.type_ == expected_call_2.type_ + assert call_2.url == expected_call_2.url + assert call_2.headers == expected_call_2.headers + assert call_2.json == expected_call_2.json \ No newline at end of file From 1b0e6c372d8b9d6f83b3cbb35439ea4d1f728440 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 8 Apr 2025 21:18:48 -0400 Subject: [PATCH 084/182] feat(app): allow retrieving URLs for annotation without html info --- collector_db/AsyncDatabaseClient.py | 1 - tests/conftest.py | 21 +++- .../AlembicRunner.py | 0 tests/helpers/complex_test_data_functions.py | 5 +- tests/test_alembic/conftest.py | 2 +- tests/test_alembic/helpers.py | 2 +- 
.../integration/api/test_annotate.py | 98 ++++++++++++++++++- 7 files changed, 120 insertions(+), 9 deletions(-) rename tests/{test_alembic => helpers}/AlembicRunner.py (100%) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 34ebe7f7..39dba50e 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -129,7 +129,6 @@ async def get_next_url_for_user_annotation( URL, ) .where(URL.outcome == URLStatus.PENDING.value) - .where(exists(select(URLHTMLContent).where(URLHTMLContent.url_id == URL.id))) # URL must not have metadata annotation by this user .where( not_( diff --git a/tests/conftest.py b/tests/conftest.py index 6181dd50..3e33d57a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,13 @@ import pytest from alembic import command from alembic.config import Config -from sqlalchemy import create_engine +from sqlalchemy import create_engine, inspect, MetaData +from sqlalchemy.orm import scoped_session, sessionmaker from collector_db.DatabaseClient import DatabaseClient from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import Base +from helpers.AlembicRunner import AlembicRunner from tests.helpers.DBDataCreator import DBDataCreator @@ -19,7 +21,22 @@ def setup_and_teardown(): "sqlalchemy.url", get_postgres_connection_string() ) - command.upgrade(alembic_cfg, "head") + live_connection = engine.connect() + runner = AlembicRunner( + alembic_config=alembic_cfg, + inspector=inspect(live_connection), + metadata=MetaData(), + connection=live_connection, + session=scoped_session(sessionmaker(bind=live_connection)), + ) + try: + runner.upgrade("head") + except Exception as e: + runner.reset_schema() + runner.stamp("base") + runner.upgrade("head") + + live_connection.close() engine.dispose() yield diff --git a/tests/test_alembic/AlembicRunner.py b/tests/helpers/AlembicRunner.py similarity index 100% rename from tests/test_alembic/AlembicRunner.py rename to tests/helpers/AlembicRunner.py diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 57fd6b96..18d3f92a 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -36,11 +36,12 @@ class AnnotateAgencySetupInfo(BaseModel): async def setup_for_annotate_agency( db_data_creator: DBDataCreator, url_count: int, - suggestion_type: SuggestionType = SuggestionType.UNKNOWN + suggestion_type: SuggestionType = SuggestionType.UNKNOWN, + with_html_content: bool = True ): buci: BatchURLCreationInfo = await db_data_creator.batch_and_urls( url_count=url_count, - with_html_content=True + with_html_content=with_html_content ) await db_data_creator.auto_suggestions( url_ids=buci.url_ids, diff --git a/tests/test_alembic/conftest.py b/tests/test_alembic/conftest.py index 11b75b92..ff0591d1 100644 --- a/tests/test_alembic/conftest.py +++ b/tests/test_alembic/conftest.py @@ -4,7 +4,7 @@ from sqlalchemy.orm import scoped_session, sessionmaker from collector_db.helper_functions import get_postgres_connection_string -from tests.test_alembic.AlembicRunner import AlembicRunner +from helpers.AlembicRunner import AlembicRunner @pytest.fixture() diff --git a/tests/test_alembic/helpers.py b/tests/test_alembic/helpers.py index d6b2bea4..32d67321 100644 --- a/tests/test_alembic/helpers.py +++ b/tests/test_alembic/helpers.py @@ -3,7 +3,7 @@ from sqlalchemy import text from sqlalchemy.orm import Session -from 
tests.test_alembic.AlembicRunner import AlembicRunner +from helpers.AlembicRunner import AlembicRunner def get_enum_values(enum_name: str, session: Session) -> list[str]: diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 3d870371..0e462ba5 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -28,8 +28,12 @@ def check_url_mappings_match( def check_html_info_not_empty( html_info: ResponseHTMLInfo ): - assert html_info.description != "" - assert html_info.title != "" + assert not html_info_empty(html_info) + +def html_info_empty( + html_info: ResponseHTMLInfo +) -> bool: + return html_info.description == "" and html_info.title == "" @pytest.mark.asyncio async def test_annotate_relevancy(api_test_helper): @@ -123,6 +127,36 @@ async def test_annotate_relevancy(api_test_helper): assert results[0].relevant is True +@pytest.mark.asyncio +async def test_annotate_relevancy_no_html(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) @pytest.mark.asyncio async def test_annotate_record_type(api_test_helper): @@ -213,6 +247,36 @@ async def test_annotate_record_type(api_test_helper): if result.url_id == inner_info_1.url_info.url_id: assert result.record_type == RecordType.BOOKING_REPORTS.value +@pytest.mark.asyncio +async def test_annotate_record_type_no_html_info(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) @pytest.mark.asyncio async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): @@ -256,6 +320,36 @@ async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): assert 
agency_suggestion.locality is not None +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestion and has not been annotated by the User + The user should receive all of the auto suggestions with full detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=False + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_id == buci.url_ids[0] + + # Check that html data is not present + assert next_annotation.html_info.description == "" + assert next_annotation.html_info.title == "" + @pytest.mark.asyncio async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): """ From 27581eb9abd422cdd5effdf5e2e98367b75c50a9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 8 Apr 2025 21:23:27 -0400 Subject: [PATCH 085/182] Fix import bug --- tests/test_alembic/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_alembic/conftest.py b/tests/test_alembic/conftest.py index ff0591d1..8cd1d0ab 100644 --- a/tests/test_alembic/conftest.py +++ b/tests/test_alembic/conftest.py @@ -4,7 +4,7 @@ from sqlalchemy.orm import scoped_session, sessionmaker from collector_db.helper_functions import get_postgres_connection_string -from helpers.AlembicRunner import AlembicRunner +from tests.helpers.AlembicRunner import AlembicRunner @pytest.fixture() From 0ba8dc14ff2907509fdb0a0c0b6e3e82a1fe74d8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 8 Apr 2025 21:26:03 -0400 Subject: [PATCH 086/182] Fix import bug --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3e33d57a..7cc4291c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from collector_db.DatabaseClient import DatabaseClient from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import Base -from helpers.AlembicRunner import AlembicRunner +from tests.helpers.AlembicRunner import AlembicRunner from tests.helpers.DBDataCreator import DBDataCreator From 3275fe38af971bf555d502da09d20f9f3588dffe Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 8 Apr 2025 21:34:21 -0400 Subject: [PATCH 087/182] Fix import bug --- tests/test_alembic/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_alembic/helpers.py b/tests/test_alembic/helpers.py index 32d67321..dfebce07 100644 --- a/tests/test_alembic/helpers.py +++ b/tests/test_alembic/helpers.py @@ -3,7 +3,7 @@ from sqlalchemy import text from sqlalchemy.orm import Session -from helpers.AlembicRunner import AlembicRunner +from tests.helpers.AlembicRunner import AlembicRunner def get_enum_values(enum_name: str, session: Session) -> list[str]: From 753a06dd66dbc6cea64e32b458f00dcd912b5bd0 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 9 Apr 2025 20:57:37 -0400 Subject: [PATCH 088/182] Temporarily disable HTML Task Operator --- core/AsyncCore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/core/AsyncCore.py b/core/AsyncCore.py index 28a14fa2..f8c42815 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -107,7 +107,7 @@ async def get_url_miscellaneous_metadata_task_operator(self): async def get_task_operators(self) -> list[TaskOperatorBase]: return [ - await self.get_url_html_task_operator(), + # await self.get_url_html_task_operator(), await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), await self.get_agency_identification_task_operator(), From 87c1057b5904d4bd04967ed533323ce955206f53 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 10 Apr 2025 07:45:44 -0400 Subject: [PATCH 089/182] Re-enable HTML Task Operator with logging on fetch_and_render --- html_tag_collector/URLRequestInterface.py | 1 + 1 file changed, 1 insertion(+) diff --git a/html_tag_collector/URLRequestInterface.py b/html_tag_collector/URLRequestInterface.py index 20ea1989..f55d9502 100644 --- a/html_tag_collector/URLRequestInterface.py +++ b/html_tag_collector/URLRequestInterface.py @@ -46,6 +46,7 @@ async def get_response(self, session: ClientSession, url: str) -> URLResponseInf return URLResponseInfo(success=False, exception=e) async def fetch_and_render(self, rr: RequestResources, url: str) -> URLResponseInfo: + print(f"Fetch and Rendering {url}") simple_response = await self.get_response(rr.session, url) if not simple_response.success: return simple_response From 0b7661e13cb8f3acc051bc5a77b6c50caed0762e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 10 Apr 2025 07:51:56 -0400 Subject: [PATCH 090/182] Re-enable HTML Task Operator with logging on fetch_and_render --- core/AsyncCore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index f8c42815..28a14fa2 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -107,7 +107,7 @@ async def get_url_miscellaneous_metadata_task_operator(self): async def get_task_operators(self) -> list[TaskOperatorBase]: return [ - # await self.get_url_html_task_operator(), + await self.get_url_html_task_operator(), await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), await self.get_agency_identification_task_operator(), From c3a8511c25bda632ff7c961d457937ee0c53e2da Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 10 Apr 2025 08:03:24 -0400 Subject: [PATCH 091/182] Transition relevancy pipeline to lazy loading --- hugging_face/HuggingFaceInterface.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hugging_face/HuggingFaceInterface.py b/hugging_face/HuggingFaceInterface.py index 4e37e9c4..87d88caf 100644 --- a/hugging_face/HuggingFaceInterface.py +++ b/hugging_face/HuggingFaceInterface.py @@ -1,12 +1,13 @@ from transformers import pipeline from collector_db.DTOs.URLWithHTML import URLWithHTML - +import gc class HuggingFaceInterface: - def __init__(self): - self.relevance_pipe = pipeline("text-classification", model="PDAP/url-relevance") + @staticmethod + def load_relevancy_model() -> pipeline: + return pipeline("text-classification", model="PDAP/url-relevance") def get_url_relevancy( self, @@ -14,7 +15,8 @@ def get_url_relevancy( threshold: float = 0.5 ) -> list[bool]: urls = [url_with_html.url for url_with_html in urls_with_html] - results: list[dict] = self.relevance_pipe(urls) + relevance_pipe = self.load_relevancy_model() + results: list[dict] = relevance_pipe(urls) bool_results = [] for result in results: @@ -23,6 +25,8 @@ def get_url_relevancy( 
bool_results.append(True) else: bool_results.append(False) + del relevance_pipe + gc.collect() return bool_results From eba18d1f16ee7febede84ceb4f4d092afc0ae33e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 10 Apr 2025 08:05:40 -0400 Subject: [PATCH 092/182] Remove log for fetch and render --- html_tag_collector/URLRequestInterface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html_tag_collector/URLRequestInterface.py b/html_tag_collector/URLRequestInterface.py index f55d9502..20ea1989 100644 --- a/html_tag_collector/URLRequestInterface.py +++ b/html_tag_collector/URLRequestInterface.py @@ -46,7 +46,6 @@ async def get_response(self, session: ClientSession, url: str) -> URLResponseInf return URLResponseInfo(success=False, exception=e) async def fetch_and_render(self, rr: RequestResources, url: str) -> URLResponseInfo: - print(f"Fetch and Rendering {url}") simple_response = await self.get_response(rr.session, url) if not simple_response.success: return simple_response From d68ab306c7fcd4ddf648e60a5626280fc2df1f7c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 10 Apr 2025 14:46:31 -0400 Subject: [PATCH 093/182] feat(app): enable task loop to repeat if prerequisites met The task loop has been modified such that, if prerequisites continue to be met, the same task loop will run again. In the case of the task looping more than 20 times, the task loop is set to break and discord notified as an indicator of potentially unwelcome activity. --- ENV.md | 1 + api/main.py | 4 ++ core/AsyncCore.py | 34 +++++++---- .../integration/api/conftest.py | 3 +- .../integration/core/test_async_core.py | 59 ++++++++++++++++--- .../security_manager/test_security_manager.py | 8 ++- util/DiscordNotifier.py | 13 ++++ 7 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 util/DiscordNotifier.py diff --git a/ENV.md b/ENV.md index 68359348..cdedd288 100644 --- a/ENV.md +++ b/ENV.md @@ -21,4 +21,5 @@ Please ensure these are properly defined in a `.env` file in the root directory. |`PDAP_EMAIL`| An email address for accessing the PDAP API. | `abc123@test.com` | |`PDAP_PASSWORD`| A password for accessing the PDAP API. | `abc123` | |`PDAP_API_KEY`| An API key for accessing the PDAP API. 
| `abc123` | +|`DISCORD_WEBHOOK_URL`| The URL for the Discord webhook used for notifications| `abc123` | diff --git a/api/main.py b/api/main.py index 8feaa165..f39cc7f3 100644 --- a/api/main.py +++ b/api/main.py @@ -20,6 +20,7 @@ from html_tag_collector.RootURLCache import RootURLCache from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface +from util.DiscordNotifier import DiscordPoster from util.helper_functions import get_from_env @@ -40,6 +41,9 @@ async def lifespan(app: FastAPI): url_request_interface=URLRequestInterface(), html_parser=HTMLResponseParser( root_url_cache=RootURLCache() + ), + discord_poster=DiscordPoster( + webhook_url=get_from_env("DISCORD_WEBHOOK_URL") ) ) async_scheduled_task_manager = AsyncScheduledTaskManager(async_core=async_core) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 28a14fa2..d95efbfe 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,22 +1,18 @@ import logging from typing import Optional -from aiohttp import ClientSession from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo -from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo -from collector_db.enums import TaskType, URLMetadataAttributeType +from collector_db.enums import TaskType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo -from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo -from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator from core.classes.TaskOperatorBase import TaskOperatorBase @@ -24,8 +20,7 @@ from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator -from core.enums import BatchStatus, SuggestionType, RecordType -from html_tag_collector.DataClassTags import convert_to_response_html_info +from core.enums import BatchStatus, RecordType from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface @@ -33,8 +28,10 @@ from pdap_api_client.AccessManager import AccessManager from pdap_api_client.PDAPClient import PDAPClient from security_manager.SecurityManager import AccessInfo +from util.DiscordNotifier import DiscordPoster from util.helper_functions import get_from_env +TASK_REPEAT_THRESHOLD = 20 class AsyncCore: @@ -44,6 +41,7 @@ def __init__( huggingface_interface: HuggingFaceInterface, url_request_interface: URLRequestInterface, html_parser: HTMLResponseParser, + discord_poster: DiscordPoster ): 
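+        # discord_poster is used by run_tasks to alert Discord when a single task repeats more than TASK_REPEAT_THRESHOLD times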
self.adb_client = adb_client self.huggingface_interface @@ -52,6 +50,7 @@ def __init__( self.logger = logging.getLogger(__name__) self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) + self.discord_poster = discord_poster async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: @@ -119,14 +118,23 @@ async def get_task_operators(self) -> list[TaskOperatorBase]: #region Tasks async def run_tasks(self): operators = await self.get_task_operators() for operator in operators: + count = 0 + meets_prereq = await operator.meets_task_prerequisites() - if not meets_prereq: - self.logger.info(f"Skipping {operator.task_type.value} Task") - continue - task_id = await self.initiate_task_in_db(task_type=operator.task_type) - run_info: TaskOperatorRunInfo = await operator.run_task(task_id) - await self.conclude_task(run_info) + while meets_prereq: + if count > TASK_REPEAT_THRESHOLD: + self.discord_poster.post_to_discord( + message=f"Task {operator.task_type.value} has been run" + f" more than {TASK_REPEAT_THRESHOLD} times in a row. " + f"Task loop terminated.") + break + task_id = await self.initiate_task_in_db(task_type=operator.task_type) + run_info: TaskOperatorRunInfo = await operator.run_task(task_id) + await self.conclude_task(run_info) + count += 1 + meets_prereq = await operator.meets_task_prerequisites() + async def conclude_task(self, run_info): await self.adb_client.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids) diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index d9a504a7..2065463e 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -30,7 +30,8 @@ def override_access_info() -> AccessInfo: return AccessInfo(user_id=MOCK_USER_ID, permissions=[Permissions.SOURCE_COLLECTOR]) @pytest.fixture -def client(db_client_test) -> Generator[TestClient, None, None]: +def client(db_client_test, monkeypatch) -> Generator[TestClient, None, None]: + monkeypatch.setenv("DISCORD_WEBHOOK_URL", "https://discord.com") with TestClient(app) as c: app.dependency_overrides[get_access_info] = override_access_info core: SourceCollectorCore = c.app.state.core diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py index 1bb09809..4aa51b77 100644 --- a/tests/test_automated/integration/core/test_async_core.py +++ b/tests/test_automated/integration/core/test_async_core.py @@ -1,5 +1,5 @@ import types -from unittest.mock import MagicMock, AsyncMock +from unittest.mock import MagicMock, AsyncMock, call import pytest @@ -27,7 +27,8 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): adb_client=ddc.adb_client, huggingface_interface=MagicMock(), url_request_interface=MagicMock(), - html_parser=MagicMock() + html_parser=MagicMock(), + discord_poster=MagicMock() ) await core.conclude_task(run_info=run_info) @@ -53,7 +54,8 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): adb_client=ddc.adb_client, huggingface_interface=MagicMock(), url_request_interface=MagicMock(), - html_parser=MagicMock() + html_parser=MagicMock(), + discord_poster=MagicMock() ) await core.conclude_task(run_info=run_info) @@ -80,7 +82,8 @@ async def test_conclude_task_error(db_data_creator: DBDataCreator): adb_client=ddc.adb_client, huggingface_interface=MagicMock(), url_request_interface=MagicMock(), - 
html_parser=MagicMock() + html_parser=MagicMock(), + discord_poster=MagicMock() ) await core.conclude_task(run_info=run_info) @@ -96,7 +99,8 @@ async def test_run_task_prereq_not_met(): adb_client=AsyncMock(), huggingface_interface=AsyncMock(), url_request_interface=AsyncMock(), - html_parser=AsyncMock() + html_parser=AsyncMock(), + discord_poster=MagicMock() ) mock_operator = AsyncMock() @@ -121,19 +125,22 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: adb_client=db_data_creator.adb_client, huggingface_interface=AsyncMock(), url_request_interface=AsyncMock(), - html_parser=AsyncMock() + html_parser=AsyncMock(), + discord_poster=MagicMock() ) core.conclude_task = AsyncMock() mock_operator = AsyncMock() - mock_operator.meets_task_prerequisites = AsyncMock(return_value=True) + mock_operator.meets_task_prerequisites = AsyncMock( + side_effect=[True, False] + ) mock_operator.task_type = TaskType.HTML mock_operator.run_task = types.MethodType(run_task, mock_operator) AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator]) await core.run_tasks() - mock_operator.meets_task_prerequisites.assert_called_once() + mock_operator.meets_task_prerequisites.assert_has_calls([call(), call()]) results = await db_data_creator.adb_client.get_all(Task) @@ -142,3 +149,39 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: core.conclude_task.assert_called_once() +@pytest.mark.asyncio +async def test_run_task_break_loop(db_data_creator: DBDataCreator): + """ + If the task loop for a single task runs more than 20 times in a row, + this is considered suspicious and possibly indicative of a bug. + In this case, the task loop should be terminated + and an alert should be sent to discord + """ + + async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + return TaskOperatorRunInfo( + task_id=task_id, + outcome=TaskOperatorOutcome.SUCCESS, + linked_url_ids=[1, 2, 3] + ) + + core = AsyncCore( + adb_client=db_data_creator.adb_client, + huggingface_interface=AsyncMock(), + url_request_interface=AsyncMock(), + html_parser=AsyncMock(), + discord_poster=MagicMock() + ) + core.conclude_task = AsyncMock() + + mock_operator = AsyncMock() + mock_operator.meets_task_prerequisites = AsyncMock(return_value=True) + mock_operator.task_type = TaskType.HTML + mock_operator.run_task = types.MethodType(run_task, mock_operator) + + AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator]) + await core.run_tasks() + + core.discord_poster.post_to_discord.assert_called_once_with( + message="Task HTML has been run more than 20 times in a row. Task loop terminated." 
+ ) diff --git a/tests/test_automated/integration/security_manager/test_security_manager.py b/tests/test_automated/integration/security_manager/test_security_manager.py index 3dc676ad..eb7e8506 100644 --- a/tests/test_automated/integration/security_manager/test_security_manager.py +++ b/tests/test_automated/integration/security_manager/test_security_manager.py @@ -18,12 +18,16 @@ def mock_get_secret_key(mocker): VALID_TOKEN = "valid_token" INVALID_TOKEN = "invalid_token" FAKE_PAYLOAD = { - "sub": 1, + "sub": "1", "permissions": [Permissions.SOURCE_COLLECTOR.value] } -def test_api_with_valid_token(mock_get_secret_key): +def test_api_with_valid_token( + mock_get_secret_key, + monkeypatch +): + monkeypatch.setenv("DISCORD_WEBHOOK_URL", "https://discord.com") token = jwt.encode(FAKE_PAYLOAD, SECRET_KEY, algorithm=ALGORITHM) # Create Test Client diff --git a/util/DiscordNotifier.py b/util/DiscordNotifier.py new file mode 100644 index 00000000..15e74020 --- /dev/null +++ b/util/DiscordNotifier.py @@ -0,0 +1,13 @@ +import logging + +import requests + + +class DiscordPoster: + def __init__(self, webhook_url: str): + if not webhook_url: + logging.error("WEBHOOK_URL environment variable not set") + raise ValueError("WEBHOOK_URL environment variable not set") + self.webhook_url = webhook_url + def post_to_discord(self, message): + requests.post(self.webhook_url, json={"content": message}) From e2575af65f462f27e47ed6b71ca04e0169fb7b8d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 12 Apr 2025 07:47:59 -0400 Subject: [PATCH 094/182] DRAFT --- api/main.py | 19 ++- api/routes/batch.py | 27 +-- api/routes/collector.py | 11 +- collector_db/AsyncDatabaseClient.py | 125 +++++++++++++- collector_manager/AsyncCollectorBase.py | 124 ++++++++++++++ collector_manager/AsyncCollectorManager.py | 84 ++++++++++ collector_manager/CollectorManager.py | 24 +-- collector_manager/ExampleCollector.py | 13 +- collector_manager/constants.py | 14 ++ core/AsyncCore.py | 69 +++++++- core/SourceCollectorCore.py | 13 +- requirements.txt | 15 +- source_collectors/auto_googler/AutoGoogler.py | 6 +- .../auto_googler/AutoGooglerCollector.py | 15 +- .../auto_googler/GoogleSearcher.py | 29 ++-- source_collectors/ckan/CKANAPIInterface.py | 63 ++++--- source_collectors/ckan/CKANCollector.py | 29 ++-- .../ckan/ckan_scraper_toolkit.py | 84 +++++----- source_collectors/ckan/main.py | 8 +- .../ckan/scrape_ckan_data_portals.py | 28 ++-- .../common_crawler/CommonCrawler.py | 78 +++++---- .../muckrock_fetchers/MuckrockFetcher.py | 32 +--- .../MuckrockIterFetcherBase.py | 13 +- .../lifecycle/test_auto_googler_lifecycle.py | 1 + .../source_collectors/test_ckan_collector.py | 17 +- .../test_common_crawler_collector.py | 1 + .../integration/api/conftest.py | 2 + .../integration/api/test_batch.py | 1 + .../integration/api/test_example_collector.py | 12 +- tests/test_automated/integration/conftest.py | 30 ++++ .../integration/core/test_async_core.py | 15 +- .../core/test_example_collector_lifecycle.py | 27 ++- .../test_example_collector.py | 45 ----- .../test_collector_manager.py | 154 ------------------ .../test_autogoogler_collector.py | 16 +- .../test_example_collector.py | 2 +- 36 files changed, 791 insertions(+), 455 deletions(-) create mode 100644 collector_manager/AsyncCollectorBase.py create mode 100644 collector_manager/AsyncCollectorManager.py create mode 100644 collector_manager/constants.py diff --git a/api/main.py b/api/main.py index f39cc7f3..a38ead34 100644 --- a/api/main.py +++ b/api/main.py @@ -1,6 +1,7 @@ from contextlib import 
asynccontextmanager import uvicorn from fastapi import FastAPI from api.routes.annotate import annotate_router @@ -12,6 +13,8 @@ from api.routes.url import url_router from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient +from collector_manager.AsyncCollectorManager import AsyncCollectorManager +from collector_manager.CollectorManager import CollectorManager from core.AsyncCore import AsyncCore from core.CoreLogger import CoreLogger from core.ScheduledTaskManager import AsyncScheduledTaskManager @@ -28,15 +31,26 @@ async def lifespan(app: FastAPI): # Initialize shared dependencies db_client = DatabaseClient() + adb_client = AsyncDatabaseClient() await setup_database(db_client) + core_logger = CoreLogger(db_client=db_client) + collector_manager = CollectorManager( + logger=core_logger, + db_client=db_client, + ) + async_collector_manager = AsyncCollectorManager( + logger=core_logger, + adb_client=adb_client, + ) source_collector_core = SourceCollectorCore( core_logger=CoreLogger( db_client=db_client ), db_client=DatabaseClient(), + collector_manager=collector_manager ) async_core = AsyncCore( - adb_client=AsyncDatabaseClient(), + adb_client=adb_client, huggingface_interface=HuggingFaceInterface(), url_request_interface=URLRequestInterface(), html_parser=HTMLResponseParser( @@ -44,7 +58,8 @@ async def lifespan(app: FastAPI): ), discord_poster=DiscordPoster( webhook_url=get_from_env("DISCORD_WEBHOOK_URL") - ) + ), + collector_manager=async_collector_manager ) async_scheduled_task_manager = AsyncScheduledTaskManager(async_core=async_core) diff --git a/api/routes/batch.py b/api/routes/batch.py index 9405fec6..950b6931 100644 --- a/api/routes/batch.py +++ b/api/routes/batch.py @@ -1,11 +1,13 @@ from typing import Optional -from fastapi import Path, APIRouter +from fastapi import Path, APIRouter, HTTPException from fastapi.params import Query, Depends -from api.dependencies import get_core +from api.dependencies import get_core, get_async_core from collector_db.DTOs.BatchInfo import BatchInfo +from collector_manager.CollectorManager import InvalidCollectorError from collector_manager.enums import CollectorType +from core.AsyncCore import AsyncCore from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse @@ -46,24 +48,25 @@ def get_batch_status( @batch_router.get("/{batch_id}") -def get_batch_info( +async def get_batch_info( batch_id: int = Path(description="The batch id"), - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> BatchInfo: - return core.get_batch_info(batch_id) + result = await core.get_batch_info(batch_id) + return result @batch_router.get("/{batch_id}/urls") -def get_urls_by_batch( +async def get_urls_by_batch( batch_id: int = Path(description="The batch id"), page: int = Query( description="The page number", default=1 ), - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> GetURLsByBatchResponse: - return core.get_urls_by_batch(batch_id, page=page) + return await core.get_urls_by_batch(batch_id, page=page) @batch_router.get("/{batch_id}/duplicates") def get_duplicates_by_batch( @@ -90,9 +93,13 @@ def get_batch_logs( return
core.get_batch_logs(batch_id) @batch_router.post("/{batch_id}/abort") -def abort_batch( +async def abort_batch( batch_id: int = Path(description="The batch id"), core: SourceCollectorCore = Depends(get_core), + async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> MessageResponse: - return core.abort_batch(batch_id) \ No newline at end of file + try: + return core.abort_batch(batch_id) + except InvalidCollectorError as e: + return await async_core.abort_batch(batch_id) \ No newline at end of file diff --git a/api/routes/collector.py b/api/routes/collector.py index b49d569c..18c488b8 100644 --- a/api/routes/collector.py +++ b/api/routes/collector.py @@ -1,9 +1,10 @@ from fastapi import APIRouter from fastapi.params import Depends -from api.dependencies import get_core +from api.dependencies import get_core, get_async_core from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType +from core.AsyncCore import AsyncCore from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.SourceCollectorCore import SourceCollectorCore from security_manager.SecurityManager import AccessInfo, get_access_info @@ -22,13 +23,13 @@ @collector_router.post("/example") async def start_example_collector( dto: ExampleInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the example collector """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.EXAMPLE, dto=dto, user_id=access_info.user_id @@ -67,13 +68,13 @@ async def start_common_crawler_collector( @collector_router.post("/auto-googler") async def start_auto_googler_collector( dto: AutoGooglerInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the auto googler collector """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.AUTO_GOOGLER, dto=dto, user_id=access_info.user_id diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 39dba50e..60fdcdfe 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,8 +1,9 @@ from functools import wraps -from typing import Optional, Type, Any +from typing import Optional, Type, Any, List from fastapi import HTTPException from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert, asc +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute from sqlalchemy.sql.functions import coalesce @@ -10,6 +11,9 @@ from collector_db.ConfigManager import ConfigManager from collector_db.DTOConverter import DTOConverter +from collector_db.DTOs.BatchInfo import BatchInfo +from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo +from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType @@ -23,7 +27,7 @@ from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ 
RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ - UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency + UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate from collector_manager.enums import URLStatus, CollectorType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo @@ -1336,4 +1340,119 @@ async def reject_url( url_id=url_id ) - session.add(rejecting_user_url) \ No newline at end of file + session.add(rejecting_user_url) + + @session_manager + async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchInfo]: + """Retrieve a batch by ID.""" + query = Select(Batch).where(Batch.id == batch_id) + result = await session.execute(query) + batch = result.scalars().first() + return BatchInfo(**batch.__dict__) + + @session_manager + async def get_urls_by_batch(self, session, batch_id: int, page: int = 1) -> List[URLInfo]: + """Retrieve all URLs associated with a batch.""" + query = Select(URL).where(URL.batch_id == batch_id).order_by(URL.id).limit(100).offset((page - 1) * 100) + result = await session.execute(query) + urls = result.scalars().all() + return ([URLInfo(**url.__dict__) for url in urls]) + + @session_manager + async def insert_url(self, session: AsyncSession, url_info: URLInfo) -> int: + """Insert a new URL into the database.""" + url_entry = URL( + batch_id=url_info.batch_id, + url=url_info.url, + collector_metadata=url_info.collector_metadata, + outcome=url_info.outcome.value + ) + session.add(url_entry) + await session.flush() + return url_entry.id + + @session_manager + async def get_url_info_by_url(self, session: AsyncSession, url: str) -> Optional[URLInfo]: + query = Select(URL).where(URL.url == url) + raw_result = await session.execute(query) + url = raw_result.scalars().first() + return URLInfo(**url.__dict__) + + @session_manager + async def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo]): + for duplicate_info in duplicate_infos: + duplicate = Duplicate( + batch_id=duplicate_info.duplicate_batch_id, + original_url_id=duplicate_info.original_url_id, + ) + session.add(duplicate) + + @session_manager + async def insert_batch(self, session: AsyncSession, batch_info: BatchInfo) -> int: + """Insert a new batch into the database and return its ID.""" + batch = Batch( + strategy=batch_info.strategy, + user_id=batch_info.user_id, + status=batch_info.status.value, + parameters=batch_info.parameters, + total_url_count=batch_info.total_url_count, + original_url_count=batch_info.original_url_count, + duplicate_url_count=batch_info.duplicate_url_count, + compute_time=batch_info.compute_time, + strategy_success_rate=batch_info.strategy_success_rate, + metadata_success_rate=batch_info.metadata_success_rate, + agency_match_rate=batch_info.agency_match_rate, + record_type_match_rate=batch_info.record_type_match_rate, + record_category_match_rate=batch_info.record_category_match_rate, + ) + session.add(batch) + await session.flush() + return batch.id + + + async def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo: + url_mappings = [] + duplicates = [] + for url_info in url_infos: + url_info.batch_id = batch_id + try: + url_id = await self.insert_url(url_info) + 
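# insert_url runs in its own session via the session_manager decorator, so a duplicate url raises IntegrityError here without rolling back the other inserts +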
url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) + except IntegrityError: + orig_url_info = await self.get_url_info_by_url(url_info.url) + duplicate_info = DuplicateInsertInfo( + duplicate_batch_id=batch_id, + original_url_id=orig_url_info.id + ) + duplicates.append(duplicate_info) + await self.insert_duplicates(duplicates) + + return InsertURLsInfo( + url_mappings=url_mappings, + total_count=len(url_infos), + original_count=len(url_mappings), + duplicate_count=len(duplicates), + url_ids=[url_mapping.url_id for url_mapping in url_mappings] + ) + + @session_manager + async def update_batch_post_collection( + self, + session, + batch_id: int, + total_url_count: int, + original_url_count: int, + duplicate_url_count: int, + batch_status: BatchStatus, + compute_time: float = None, + ): + + query = Select(Batch).where(Batch.id == batch_id) + result = await session.execute(query) + batch = result.scalars().first() + + batch.total_url_count = total_url_count + batch.original_url_count = original_url_count + batch.duplicate_url_count = duplicate_url_count + batch.status = batch_status.value + batch.compute_time = compute_time diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py new file mode 100644 index 00000000..672d9d9c --- /dev/null +++ b/collector_manager/AsyncCollectorBase.py @@ -0,0 +1,124 @@ +import abc +import asyncio +import time +from abc import ABC +from typing import Type, Optional + +from pydantic import BaseModel + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo +from collector_db.DTOs.LogInfo import LogInfo +from collector_manager.enums import CollectorType +from core.CoreLogger import CoreLogger +from core.enums import BatchStatus +from core.preprocessors.PreprocessorBase import PreprocessorBase + + +class AsyncCollectorBase(ABC): + collector_type: CollectorType = None + preprocessor: Type[PreprocessorBase] = None + + + def __init__( + self, + batch_id: int, + dto: BaseModel, + logger: CoreLogger, + adb_client: AsyncDatabaseClient, + raise_error: bool = False + ) -> None: + self.batch_id = batch_id + self.adb_client = adb_client + self.dto = dto + self.data: Optional[BaseModel] = None + self.logger = logger + self.status = BatchStatus.IN_PROCESS + self.start_time = None + self.compute_time = None + self.raise_error = raise_error + + @abc.abstractmethod + async def run_implementation(self) -> None: + """ + This is the method that will be overridden by each collector + No other methods should be modified except for this one. 
+ However, in each inherited class, new methods in addition to this one can be created + Returns: + + """ + raise NotImplementedError + + async def start_timer(self) -> None: + self.start_time = time.time() + + async def stop_timer(self) -> None: + self.compute_time = time.time() - self.start_time + + async def handle_error(self, e: Exception) -> None: + if self.raise_error: + raise e + await self.log(f"Error: {e}") + await self.adb_client.update_batch_post_collection( + batch_id=self.batch_id, + batch_status=self.status, + compute_time=self.compute_time, + total_url_count=0, + original_url_count=0, + duplicate_url_count=0 + ) + + async def process(self) -> None: + await self.log("Processing collector...", allow_abort=False) + preprocessor = self.preprocessor() + url_infos = preprocessor.preprocess(self.data) + await self.log(f"URLs processed: {len(url_infos)}", allow_abort=False) + + await self.log("Inserting URLs...", allow_abort=False) + insert_urls_info: InsertURLsInfo = await self.adb_client.insert_urls( + url_infos=url_infos, + batch_id=self.batch_id + ) + await self.log("Updating batch...", allow_abort=False) + await self.adb_client.update_batch_post_collection( + batch_id=self.batch_id, + total_url_count=insert_urls_info.total_count, + duplicate_url_count=insert_urls_info.duplicate_count, + original_url_count=insert_urls_info.original_count, + batch_status=self.status, + compute_time=self.compute_time + ) + await self.log("Done processing collector.", allow_abort=False) + + async def run(self) -> None: + try: + await self.start_timer() + await self.run_implementation() + await self.stop_timer() + await self.log("Collector completed successfully.") + await self.close() + await self.process() + except asyncio.CancelledError: + await self.stop_timer() + self.status = BatchStatus.ABORTED + await self.adb_client.update_batch_post_collection( + batch_id=self.batch_id, + batch_status=BatchStatus.ABORTED, + compute_time=self.compute_time, + total_url_count=0, + original_url_count=0, + duplicate_url_count=0 + ) + except Exception as e: + await self.stop_timer() + self.status = BatchStatus.ERROR + await self.handle_error(e) + + async def log(self, message: str, allow_abort = True) -> None: + self.logger.log(LogInfo( + batch_id=self.batch_id, + log=message + )) + + async def close(self) -> None: + self.status = BatchStatus.COMPLETE diff --git a/collector_manager/AsyncCollectorManager.py b/collector_manager/AsyncCollectorManager.py new file mode 100644 index 00000000..ecce57b6 --- /dev/null +++ b/collector_manager/AsyncCollectorManager.py @@ -0,0 +1,84 @@ +import asyncio +from http import HTTPStatus +from typing import Dict + +from fastapi import HTTPException +from pydantic import BaseModel + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_manager.CollectorBase import CollectorBase +from collector_manager.CollectorManager import InvalidCollectorError +from collector_manager.collector_mapping import COLLECTOR_MAPPING +from collector_manager.enums import CollectorType +from core.CoreLogger import CoreLogger + + +class AsyncCollectorManager: + + def __init__( + self, + logger: CoreLogger, + adb_client: AsyncDatabaseClient, + dev_mode: bool = False + ): + self.collectors: Dict[int, CollectorBase] = {} + self.adb_client = adb_client + self.logger = logger + self.async_tasks: dict[int, asyncio.Task] = {} + self.dev_mode = dev_mode + + async def has_collector(self, cid: int) -> bool: + return cid in self.collectors + + async def start_async_collector( + self, + 
collector_type: CollectorType, + batch_id: int, + dto: BaseModel + ) -> None: + if batch_id in self.collectors: + raise ValueError(f"Collector with batch_id {batch_id} is already running.") + try: + collector_class = COLLECTOR_MAPPING[collector_type] + collector = collector_class( + batch_id=batch_id, + dto=dto, + logger=self.logger, + adb_client=self.adb_client, + raise_error=True if self.dev_mode else False + ) + except KeyError: + raise InvalidCollectorError(f"Collector {collector_type.value} not found.") + + self.collectors[batch_id] = collector + + task = asyncio.create_task(collector.run()) + self.async_tasks[batch_id] = task + + def try_getting_collector(self, cid): + collector = self.collectors.get(cid) + if collector is None: + raise InvalidCollectorError(f"Collector with CID {cid} not found.") + return collector + + async def abort_collector_async(self, cid: int) -> None: + task = self.async_tasks.get(cid) + if not task: + raise HTTPException(status_code=HTTPStatus.NOT_FOUND, detail="Task not found") + task.cancel() + try: + await task # Await so cancellation propagates + except asyncio.CancelledError: + pass + + self.async_tasks.pop(cid) + + async def shutdown_all_collectors(self) -> None: + for cid, task in list(self.async_tasks.items()): + if task.done(): + try: + task.result() + except Exception as e: + raise e + await self.abort_collector_async(cid) \ No newline at end of file diff --git a/collector_manager/CollectorManager.py b/collector_manager/CollectorManager.py index 658b20a8..e37b47c5 100644 --- a/collector_manager/CollectorManager.py +++ b/collector_manager/CollectorManager.py @@ -3,12 +3,16 @@ Can start, stop, and get info on running collectors And manages the retrieval of collector info """ +import asyncio import threading from concurrent.futures import Future, ThreadPoolExecutor +from http import HTTPStatus from typing import Dict, List +from fastapi import HTTPException from pydantic import BaseModel +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient from collector_manager.CollectorBase import CollectorBase from collector_manager.collector_mapping import COLLECTOR_MAPPING @@ -38,12 +42,13 @@ def __init__( self.dev_mode = dev_mode self.executor = ThreadPoolExecutor(max_workers=self.max_workers) + async def has_collector(self, cid: int) -> bool: + return cid in self.collectors + + def restart_executor(self): self.executor = ThreadPoolExecutor(max_workers=self.max_workers) - def list_collectors(self) -> List[str]: - return [cm.value for cm in list(COLLECTOR_MAPPING.keys())] - def start_collector( self, collector_type: CollectorType, @@ -73,18 +78,6 @@ def start_collector( future = self.executor.submit(collector.run) self.futures[batch_id] = future - # thread = threading.Thread(target=collector.run) - # self.threads[batch_id] = thread - # thread.start() - - def get_info(self, cid: str) -> str: - collector = self.collectors.get(cid) - if not collector: - return f"Collector with CID {cid} not found."
- logs = "\n".join(collector.logs[-3:]) # Show the last 3 logs - return f"{cid} ({collector.name}) - {collector.status}\nLogs:\n{logs}" - - def try_getting_collector(self, cid): collector = self.collectors.get(cid) if collector is None: @@ -93,6 +86,7 @@ def try_getting_collector(self, cid): def abort_collector(self, cid: int) -> None: collector = self.try_getting_collector(cid) + # Get collector thread thread = self.threads.get(cid) future = self.futures.get(cid) diff --git a/collector_manager/ExampleCollector.py b/collector_manager/ExampleCollector.py index c5c2a69c..2d54eced 100644 --- a/collector_manager/ExampleCollector.py +++ b/collector_manager/ExampleCollector.py @@ -3,27 +3,28 @@ Exists as a proof of concept for collector functionality """ +import asyncio import time -from collector_manager.CollectorBase import CollectorBase +from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.DTOs.ExampleOutputDTO import ExampleOutputDTO from collector_manager.enums import CollectorType from core.preprocessors.ExamplePreprocessor import ExamplePreprocessor -class ExampleCollector(CollectorBase): +class ExampleCollector(AsyncCollectorBase): collector_type = CollectorType.EXAMPLE preprocessor = ExamplePreprocessor - def run_implementation(self) -> None: + async def run_implementation(self) -> None: dto: ExampleInputDTO = self.dto sleep_time = dto.sleep_time for i in range(sleep_time): # Simulate a task - self.log(f"Step {i + 1}/{sleep_time}") - time.sleep(1) # Simulate work + await self.log(f"Step {i + 1}/{sleep_time}") + await asyncio.sleep(1) # Simulate work self.data = ExampleOutputDTO( message=f"Data collected by {self.batch_id}", urls=["https://example.com", "https://example.com/2"], parameters=self.dto.model_dump(), - ) + ) \ No newline at end of file diff --git a/collector_manager/constants.py b/collector_manager/constants.py new file mode 100644 index 00000000..444fad06 --- /dev/null +++ b/collector_manager/constants.py @@ -0,0 +1,14 @@ +from collector_manager.enums import CollectorType + +ASYNC_COLLECTORS = [ + CollectorType.AUTO_GOOGLER, + CollectorType.EXAMPLE +] + +SYNC_COLLECTORS = [ + CollectorType.MUCKROCK_SIMPLE_SEARCH, + CollectorType.MUCKROCK_COUNTY_SEARCH, + CollectorType.MUCKROCK_ALL_SEARCH, + CollectorType.CKAN, + CollectorType.COMMON_CRAWLER, +] \ No newline at end of file diff --git a/core/AsyncCore.py b/core/AsyncCore.py index d95efbfe..c7626111 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,18 +1,29 @@ import logging +from http import HTTPStatus +from fastapi import HTTPException from typing import Optional +from pydantic import BaseModel from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType +from collector_manager.AsyncCollectorManager import AsyncCollectorManager +from collector_manager.CollectorManager import CollectorManager +from collector_manager.constants import ASYNC_COLLECTORS +from collector_manager.enums import CollectorType +from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import
GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo from core.DTOs.GetTasksResponse import GetTasksResponse +from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo +from core.DTOs.MessageResponse import MessageResponse from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator from core.classes.TaskOperatorBase import TaskOperatorBase @@ -41,7 +52,8 @@ def __init__( huggingface_interface: HuggingFaceInterface, url_request_interface: URLRequestInterface, html_parser: HTMLResponseParser, - discord_poster: DiscordPoster + discord_poster: DiscordPoster, + collector_manager: AsyncCollectorManager ): self.adb_client = adb_client self.huggingface_interface = huggingface_interface @@ -51,11 +63,66 @@ def __init__( self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) self.discord_poster = discord_poster + self.collector_manager = collector_manager async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: return await self.adb_client.get_urls(page=page, errors=errors) + async def shutdown(self): + await self.collector_manager.shutdown_all_collectors() + + #region Batch + async def get_batch_info(self, batch_id: int) -> BatchInfo: + return await self.adb_client.get_batch_by_id(batch_id) + + async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> GetURLsByBatchResponse: + url_infos = await self.adb_client.get_urls_by_batch(batch_id, page) + return GetURLsByBatchResponse(urls=url_infos) + + async def abort_batch(self, batch_id: int) -> MessageResponse: + await self.collector_manager.abort_collector_async(cid=batch_id) + return MessageResponse(message="Batch aborted.") + + #endregion + + # region Collector + async def initiate_collector( + self, + collector_type: CollectorType, + user_id: int, + dto: Optional[BaseModel] = None, + ): + """ + Reserves a batch ID from the database + and starts the requisite collector + """ + if collector_type not in ASYNC_COLLECTORS: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST, + detail=f"Collector type {collector_type} is not supported" + ) + + batch_info = BatchInfo( + strategy=collector_type.value, + status=BatchStatus.IN_PROCESS, + parameters=dto.model_dump(), + user_id=user_id + ) + + batch_id = await self.adb_client.insert_batch(batch_info) + await self.collector_manager.start_async_collector( + collector_type=collector_type, + batch_id=batch_id, + dto=dto + ) + return CollectorStartInfo( + batch_id=batch_id, + message=f"Started {collector_type.value} collector."
+ ) + + # endregion + #region Task Operators async def get_url_html_task_operator(self): diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index cf4ad3a3..585bcb52 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -21,27 +21,18 @@ class SourceCollectorCore: def __init__( self, core_logger: CoreLogger, + collector_manager: CollectorManager, db_client: DatabaseClient = DatabaseClient(), dev_mode: bool = False ): self.db_client = db_client self.core_logger = core_logger - self.collector_manager = CollectorManager( - logger=core_logger, - db_client=db_client - ) + self.collector_manager = collector_manager if not dev_mode: self.scheduled_task_manager = ScheduledTaskManager(db_client=db_client) else: self.scheduled_task_manager = None - def get_batch_info(self, batch_id: int) -> BatchInfo: - return self.db_client.get_batch_by_id(batch_id) - - def get_urls_by_batch(self, batch_id: int, page: int = 1) -> GetURLsByBatchResponse: - url_infos = self.db_client.get_urls_by_batch(batch_id, page) - return GetURLsByBatchResponse(urls=url_infos) - def get_duplicate_urls_by_batch(self, batch_id: int, page: int = 1) -> GetDuplicatesByBatchResponse: dup_infos = self.db_client.get_duplicates_by_batch_id(batch_id, page=page) return GetDuplicatesByBatchResponse(duplicates=dup_infos) diff --git a/requirements.txt b/requirements.txt index 48f86981..911e66fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -requests~=2.31.0 +requests~=2.32.3 python-dotenv~=1.0.1 bs4~=0.0.2 tqdm>=4.64.1 @@ -9,7 +9,7 @@ psycopg2-binary~=2.9.6 pandas~=2.2.3 datasets~=2.19.1 # common_crawler only -huggingface-hub~=0.22.2 +huggingface-hub~=0.28.1 # html_tag_collector_only lxml~=5.1.0 @@ -19,13 +19,13 @@ beautifulsoup4>=4.12.3 from-root~=1.3.0 # Google Collector -google-api-python-client>=2.156.0 +google-api-python-client>=2.156.0 # TODO: Check for delete marshmallow~=3.23.2 sqlalchemy~=2.0.36 fastapi[standard]~=0.115.6 httpx~=0.28.1 -ckanapi~=4.8 +ckanapi~=4.8 # TODO: Check for delete psycopg[binary]~=3.1.20 APScheduler~=3.11.0 alembic~=1.14.0 @@ -46,4 +46,9 @@ PyJWT~=2.10.1 pytest-timeout~=2.3.1 openai~=1.60.1 -aiohttp~=3.11.11 \ No newline at end of file +aiohttp~=3.11.11 +uvicorn~=0.34.0 +pydantic~=2.10.6 +starlette~=0.45.3 +numpy~=1.26.4 +docker~=7.1.0 \ No newline at end of file diff --git a/source_collectors/auto_googler/AutoGoogler.py b/source_collectors/auto_googler/AutoGoogler.py index 937466be..368f75fb 100644 --- a/source_collectors/auto_googler/AutoGoogler.py +++ b/source_collectors/auto_googler/AutoGoogler.py @@ -1,3 +1,5 @@ +import asyncio + from source_collectors.auto_googler.DTOs import GoogleSearchQueryResultsInnerDTO from source_collectors.auto_googler.GoogleSearcher import GoogleSearcher from source_collectors.auto_googler.SearchConfig import SearchConfig @@ -16,14 +18,14 @@ def __init__(self, search_config: SearchConfig, google_searcher: GoogleSearcher) query : [] for query in search_config.queries } - def run(self) -> str: + async def run(self) -> str: """ Runs the AutoGoogler Yields status messages """ for query in self.search_config.queries: yield f"Searching for '{query}' ..." - results = self.google_searcher.search(query) + results = await self.google_searcher.search(query) yield f"Found {len(results)} results for '{query}'." 
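+        # search() returns None once the daily quota is exhausted, so only successful queries are stored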
if results is not None: self.data[query] = results diff --git a/source_collectors/auto_googler/AutoGooglerCollector.py b/source_collectors/auto_googler/AutoGooglerCollector.py index 189eaa11..b678f066 100644 --- a/source_collectors/auto_googler/AutoGooglerCollector.py +++ b/source_collectors/auto_googler/AutoGooglerCollector.py @@ -1,4 +1,6 @@ -from collector_manager.CollectorBase import CollectorBase +import asyncio + +from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.enums import CollectorType from core.preprocessors.AutoGooglerPreprocessor import AutoGooglerPreprocessor from source_collectors.auto_googler.AutoGoogler import AutoGoogler @@ -8,11 +10,11 @@ from util.helper_functions import get_from_env, base_model_list_dump -class AutoGooglerCollector(CollectorBase): +class AutoGooglerCollector(AsyncCollectorBase): collector_type = CollectorType.AUTO_GOOGLER preprocessor = AutoGooglerPreprocessor - def run_implementation(self) -> None: + async def run_to_completion(self) -> AutoGoogler: dto: AutoGooglerInputDTO = self.dto auto_googler = AutoGoogler( search_config=SearchConfig( @@ -24,8 +26,13 @@ def run_implementation(self) -> None: cse_id=get_from_env("GOOGLE_CSE_ID"), ) ) - for log in auto_googler.run(): - self.log(log) + async for log in auto_googler.run(): + await self.log(log) + return auto_googler + + async def run_implementation(self) -> None: + + auto_googler = await self.run_to_completion() inner_data = [] for query in auto_googler.search_config.queries: diff --git a/source_collectors/auto_googler/GoogleSearcher.py b/source_collectors/auto_googler/GoogleSearcher.py index 7d599513..fe52ea45 100644 --- a/source_collectors/auto_googler/GoogleSearcher.py +++ b/source_collectors/auto_googler/GoogleSearcher.py @@ -1,5 +1,7 @@ +import asyncio from typing import Union +import aiohttp -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError @@ -28,8 +30,7 @@ class GoogleSearcher: search results as dictionaries or None if the daily quota for the API has been exceeded. Raises a RuntimeError if any other error occurs during the search. """ - GOOGLE_SERVICE_NAME = "customsearch" - GOOGLE_SERVICE_VERSION = "v1" + GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1" def __init__( self, @@ -41,11 +42,7 @@ def __init__( self.api_key = api_key self.cse_id = cse_id - self.service = build(self.GOOGLE_SERVICE_NAME, - self.GOOGLE_SERVICE_VERSION, - developerKey=self.api_key) - - def search(self, query: str) -> Union[list[dict], None]: + async def search(self, query: str) -> Union[list[dict], None]: """ Searches for results using the specified query. @@ -56,7 +53,7 @@ def search(self, query: str) -> Union[list[dict], None]: If the daily quota is exceeded, None is returned.
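+ Illustrative usage (hypothetical query; assumes a configured api_key and cse_id): + results = await GoogleSearcher(api_key, cse_id).search("police incident reports")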
""" try: - return self.get_query_results(query) + return await self.get_query_results(query) # Process your results except HttpError as e: if "Quota exceeded" in str(e): @@ -64,11 +61,23 @@ def search(self, query: str) -> Union[list[dict], None]: else: raise RuntimeError(f"An error occurred: {str(e)}") - def get_query_results(self, query) -> list[GoogleSearchQueryResultsInnerDTO] or None: - results = self.service.cse().list(q=query, cx=self.cse_id).execute() + async def get_query_results(self, query) -> list[GoogleSearchQueryResultsInnerDTO] or None: + params = { + "key": self.api_key, + "cx": self.cse_id, + "q": query, + } + + async with aiohttp.ClientSession() as session: + async with session.get(self.GOOGLE_SEARCH_URL, params=params) as response: + response.raise_for_status() + results = await response.json() + if "items" not in results: return None + items = [] + for item in results["items"]: inner_dto = GoogleSearchQueryResultsInnerDTO( url=item["link"], diff --git a/source_collectors/ckan/CKANAPIInterface.py b/source_collectors/ckan/CKANAPIInterface.py index 551ed023..563d795d 100644 --- a/source_collectors/ckan/CKANAPIInterface.py +++ b/source_collectors/ckan/CKANAPIInterface.py @@ -1,13 +1,13 @@ +import asyncio from typing import Optional -from ckanapi import RemoteCKAN, NotFound +import aiohttp +from aiohttp import ContentTypeError class CKANAPIError(Exception): pass -# TODO: Maybe return Base Models? - class CKANAPIInterface: """ Interfaces with the CKAN API @@ -15,22 +15,47 @@ class CKANAPIInterface: def __init__(self, base_url: str): self.base_url = base_url - self.remote = RemoteCKAN(base_url, get_only=True) - - def package_search(self, query: str, rows: int, start: int, **kwargs): - return self.remote.action.package_search(q=query, rows=rows, start=start, **kwargs) - def get_organization(self, organization_id: str): + @staticmethod + def _serialize_params(params: dict) -> dict: + return { + k: str(v).lower() if isinstance(v, bool) else str(v) for k, v in params.items() + } + + async def _get(self, action: str, params: dict): + url = f"{self.base_url}/api/3/action/{action}" + serialized_params = self._serialize_params(params) + async with aiohttp.ClientSession() as session: + async with session.get(url, params=serialized_params) as response: + try: + data = await response.json() + if not data.get("success", False): + raise CKANAPIError(f"Request failed: {data}") + except ContentTypeError: + raise CKANAPIError(f"Request failed: {response.text()}") + return data["result"] + + async def package_search(self, query: str, rows: int, start: int, **kwargs): + return await self._get("package_search", { + "q": query, "rows": rows, "start": start, **kwargs + }) + + async def get_organization(self, organization_id: str): try: - return self.remote.action.organization_show(id=organization_id, include_datasets=True) - except NotFound as e: - raise CKANAPIError(f"Organization {organization_id} not found" - f" for url {self.base_url}. Original error: {e}") - - def get_group_package(self, group_package_id: str, limit: Optional[int]): + return await self._get("organization_show", { + "id": organization_id, "include_datasets": True + }) + except CKANAPIError as e: + raise CKANAPIError( + f"Organization {organization_id} not found for url {self.base_url}. 
{e}" + ) + + async def get_group_package(self, group_package_id: str, limit: Optional[int]): try: - return self.remote.action.group_package_show(id=group_package_id, limit=limit) - except NotFound as e: - raise CKANAPIError(f"Group Package {group_package_id} not found" - f" for url {self.base_url}. Original error: {e}") - + return await self._get("group_package_show", { + "id": group_package_id, "limit": limit + }) + except CKANAPIError as e: + raise CKANAPIError( + f"Group Package {group_package_id} not found for url {self.base_url}. {e}" + ) \ No newline at end of file diff --git a/source_collectors/ckan/CKANCollector.py b/source_collectors/ckan/CKANCollector.py index 24477aad..873a8593 100644 --- a/source_collectors/ckan/CKANCollector.py +++ b/source_collectors/ckan/CKANCollector.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from collector_manager.CollectorBase import CollectorBase +from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.enums import CollectorType from core.preprocessors.CKANPreprocessor import CKANPreprocessor from source_collectors.ckan.DTOs import CKANInputDTO @@ -16,30 +16,35 @@ "organization_search": ckan_package_search_from_organization } -class CKANCollector(CollectorBase): +class CKANCollector(AsyncCollectorBase): collector_type = CollectorType.CKAN preprocessor = CKANPreprocessor - def run_implementation(self): - results = self.get_results() + async def run_implementation(self): + results = await self.get_results() flat_list = get_flat_list(results) deduped_flat_list = deduplicate_entries(flat_list) - list_with_collection_child_packages = self.add_collection_child_packages(deduped_flat_list) + list_with_collection_child_packages = await self.add_collection_child_packages(deduped_flat_list) - filtered_results = list(filter(filter_result, list_with_collection_child_packages)) + filtered_results = list( + filter( + filter_result, + list_with_collection_child_packages + ) + ) parsed_results = list(map(parse_result, filtered_results)) self.data = {"results": parsed_results} - def add_collection_child_packages(self, deduped_flat_list): + async def add_collection_child_packages(self, deduped_flat_list): # TODO: Find a way to clearly indicate which parts call from the CKAN API list_with_collection_child_packages = [] count = len(deduped_flat_list) for idx, result in enumerate(deduped_flat_list): if "extras" in result.keys(): - self.log(f"Found collection ({idx + 1}/{count}): {result['id']}") - collections = get_collections(result) + await self.log(f"Found collection ({idx + 1}/{count}): {result['id']}") + collections = await get_collections(result) if collections: list_with_collection_child_packages += collections[0] continue @@ -47,16 +52,16 @@ def add_collection_child_packages(self, deduped_flat_list): list_with_collection_child_packages.append(result) return list_with_collection_child_packages - def get_results(self): + async def get_results(self): results = [] dto: CKANInputDTO = self.dto for search in SEARCH_FUNCTION_MAPPINGS.keys(): - self.log(f"Running search '{search}'...") + await self.log(f"Running search '{search}'...") sub_dtos: list[BaseModel] = getattr(dto, search) if sub_dtos is None: continue func = SEARCH_FUNCTION_MAPPINGS[search] - results = perform_search( + results = await perform_search( search_func=func, search_terms=base_model_list_dump(model_list=sub_dtos), results=results diff --git a/source_collectors/ckan/ckan_scraper_toolkit.py b/source_collectors/ckan/ckan_scraper_toolkit.py index 3d5c7296..641dec2a 
100644 --- a/source_collectors/ckan/ckan_scraper_toolkit.py +++ b/source_collectors/ckan/ckan_scraper_toolkit.py @@ -1,16 +1,14 @@ """Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals""" - +import asyncio import math import sys -import time -from concurrent.futures import as_completed, ThreadPoolExecutor from dataclasses import dataclass, field from datetime import datetime from typing import Any, Optional from urllib.parse import urljoin -import requests -from bs4 import BeautifulSoup +import aiohttp +from bs4 import BeautifulSoup, ResultSet, Tag from source_collectors.ckan.CKANAPIInterface import CKANAPIInterface @@ -46,7 +44,7 @@ def to_dict(self): } -def ckan_package_search( +async def ckan_package_search( base_url: str, query: Optional[str] = None, rows: Optional[int] = sys.maxsize, @@ -69,7 +67,7 @@ def ckan_package_search( while start < rows: num_rows = rows - start + offset - packages: dict = interface.package_search( + packages: dict = await interface.package_search( query=query, rows=num_rows, start=start, **kwargs ) add_base_url_to_packages(base_url, packages) @@ -94,7 +92,7 @@ def add_base_url_to_packages(base_url, packages): [package.update(base_url=base_url) for package in packages["results"]] -def ckan_package_search_from_organization( +async def ckan_package_search_from_organization( base_url: str, organization_id: str ) -> list[dict[str, Any]]: """Returns a list of CKAN packages from an organization. Only 10 packages are able to be returned. @@ -104,22 +102,22 @@ def ckan_package_search_from_organization( :return: List of dictionaries representing the packages associated with the organization. """ interface = CKANAPIInterface(base_url) - organization = interface.get_organization(organization_id) + organization = await interface.get_organization(organization_id) packages = organization["packages"] - results = search_for_results(base_url, packages) + results = await search_for_results(base_url, packages) return results -def search_for_results(base_url, packages): +async def search_for_results(base_url, packages): results = [] for package in packages: query = f"id:{package['id']}" - results += ckan_package_search(base_url=base_url, query=query) + results += await ckan_package_search(base_url=base_url, query=query) return results -def ckan_group_package_show( +async def ckan_group_package_show( base_url: str, id: str, limit: Optional[int] = sys.maxsize ) -> list[dict[str, Any]]: """Returns a list of CKAN packages from a group. @@ -130,13 +128,13 @@ def ckan_group_package_show( :return: List of dictionaries representing the packages associated with the group. """ interface = CKANAPIInterface(base_url) - results = interface.get_group_package(group_package_id=id, limit=limit) + results = await interface.get_group_package(group_package_id=id, limit=limit) # Add the base_url to each package [package.update(base_url=base_url) for package in results] return results -def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: +async def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: """Returns a list of CKAN packages from a collection. :param base_url: Base URL of the CKAN portal before the collection ID. e.g. "https://catalog.data.gov/dataset/" @@ -144,50 +142,36 @@ def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: :return: List of Package objects representing the packages associated with the collection. 
""" url = f"{base_url}?collection_package_id={collection_id}" - soup = _get_soup(url) + soup = await _get_soup(url) # Calculate the total number of pages of packages num_results = int(soup.find(class_="new-results").text.split()[0].replace(",", "")) pages = math.ceil(num_results / 20) - packages = get_packages(base_url, collection_id, pages) + packages = await get_packages(base_url, collection_id, pages) return packages -def get_packages(base_url, collection_id, pages): +async def get_packages(base_url, collection_id, pages): packages = [] for page in range(1, pages + 1): url = f"{base_url}?collection_package_id={collection_id}&page={page}" - soup = _get_soup(url) + soup = await _get_soup(url) - futures = get_futures(base_url, packages, soup) + packages = [] + for dataset_content in soup.find_all(class_="dataset-content"): + await asyncio.sleep(1) + package = await _collection_search_get_package_data(dataset_content, base_url) + packages.append(package) - # Take a break to avoid being timed out - if len(futures) >= 15: - time.sleep(10) return packages - -def get_futures(base_url: str, packages: list[Package], soup: BeautifulSoup) -> list[Any]: - """Returns a list of futures for the collection search.""" - with ThreadPoolExecutor(max_workers=10) as executor: - futures = [ - executor.submit( - _collection_search_get_package_data, dataset_content, base_url - ) - for dataset_content in soup.find_all(class_="dataset-content") - ] - - [packages.append(package.result()) for package in as_completed(futures)] - return futures - - -def _collection_search_get_package_data(dataset_content, base_url: str): +async def _collection_search_get_package_data(dataset_content, base_url: str): """Parses the dataset content and returns a Package object.""" package = Package() joined_url = urljoin(base_url, dataset_content.a.get("href")) - dataset_soup = _get_soup(joined_url) + dataset_soup = await _get_soup(joined_url) # Determine if the dataset url should be the linked page to an external site or the current site resources = get_resources(dataset_soup) button = get_button(resources) @@ -214,7 +198,9 @@ def get_data(dataset_soup): return dataset_soup.find(property="dct:modified").text.strip() -def get_button(resources): +def get_button(resources: ResultSet) -> Optional[Tag]: + if len(resources) == 0: + return None return resources[0].find(class_="btn-group") @@ -224,7 +210,12 @@ def get_resources(dataset_soup): ) -def set_url_and_data_portal_type(button, joined_url, package, resources): +def set_url_and_data_portal_type( + button: Optional[Tag], + joined_url: str, + package: Package, + resources: ResultSet +): if len(resources) == 1 and button is not None and button.a.text == "Visit page": package.url = button.a.get("href") else: @@ -255,8 +246,9 @@ def set_description(dataset_soup, package): package.description = dataset_soup.find(class_="notes").p.text -def _get_soup(url: str) -> BeautifulSoup: +async def _get_soup(url: str) -> BeautifulSoup: """Returns a BeautifulSoup object for the given URL.""" - time.sleep(1) - response = requests.get(url) - return BeautifulSoup(response.content, "lxml") + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + response.raise_for_status() + return BeautifulSoup(await response.text(), "lxml") diff --git a/source_collectors/ckan/main.py b/source_collectors/ckan/main.py index cc6f8da7..091d2642 100644 --- a/source_collectors/ckan/main.py +++ b/source_collectors/ckan/main.py @@ -6,24 +6,24 @@ -def main(): +async def main(): """ Main 
function. """ results = [] print("Gathering results...") - results = perform_search( + results = await perform_search( search_func=ckan_package_search, search_terms=package_search, results=results, ) - results = perform_search( + results = await perform_search( search_func=ckan_group_package_show, search_terms=group_search, results=results, ) - results = perform_search( + results = await perform_search( search_func=ckan_package_search_from_organization, search_terms=organization_search, results=results, diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index 9e0b2ff1..ad3d62e2 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -15,7 +15,7 @@ sys.path.insert(1, str(p)) -def perform_search( +async def perform_search( search_func: Callable, search_terms: list[dict[str, Any]], results: list[dict[str, Any]], @@ -34,14 +34,14 @@ def perform_search( for search in tqdm(search_terms): item_results = [] for item in search[key]: - item_result = search_func(search["url"], item) + item_result = await search_func(search["url"], item) item_results.append(item_result) results += item_results return results -def get_collection_child_packages( +async def get_collection_child_packages( results: list[dict[str, Any]] ) -> list[dict[str, Any]]: """Retrieves the child packages of each collection. @@ -53,7 +53,7 @@ def get_collection_child_packages( for result in tqdm(results): if "extras" in result.keys(): - collections = get_collections(result) + collections = await get_collections(result) if collections: new_list += collections[0] continue @@ -63,15 +63,17 @@ def get_collection_child_packages( return new_list -def get_collections(result): - collections = [ - ckan_collection_search( - base_url="https://catalog.data.gov/dataset/", - collection_id=result["id"], - ) - for extra in result["extras"] - if parent_package_has_no_resources(extra=extra, result=result) - ] +async def get_collections(result): + if "extras" not in result.keys(): + return [] + + collections = [] + for extra in result["extras"]: + if parent_package_has_no_resources(extra=extra, result=result): + collections.append(await ckan_collection_search( + base_url="https://catalog.data.gov/dataset/", + collection_id=result["id"], + )) return collections diff --git a/source_collectors/common_crawler/CommonCrawler.py b/source_collectors/common_crawler/CommonCrawler.py index 78d986cb..2bd2143c 100644 --- a/source_collectors/common_crawler/CommonCrawler.py +++ b/source_collectors/common_crawler/CommonCrawler.py @@ -1,64 +1,76 @@ +import asyncio import json import time from http import HTTPStatus +from typing import Union from urllib.parse import quote_plus -import requests +import aiohttp from source_collectors.common_crawler.utils import URLWithParameters - -def make_request(search_url: URLWithParameters) -> requests.Response: +async def async_make_request( + search_url: 'URLWithParameters' +) -> Union[aiohttp.ClientResponse, None]: """ - Makes the HTTP GET request to the given search URL. - Return the response if successful, None if rate-limited. + Makes the HTTP GET request to the given search URL using aiohttp. + Return the response if successful, None if rate-limited or failed. 
""" try: - response = requests.get(str(search_url)) - response.raise_for_status() - return response - except requests.exceptions.RequestException as e: - if ( - response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR - and "SlowDown" in response.text - ): - return None - else: - print(f"Failed to get records: {e}") - return None - - -def process_response( - response: requests.Response, url: str, page: int -) -> list[str] or None: + async with aiohttp.ClientSession() as session: + async with session.get(str(search_url)) as response: + text = await response.text() + if ( + response.status == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in text + ): + return None + response.raise_for_status() + # simulate requests.Response interface for downstream compatibility + response.text_content = text # custom attribute for downstream use + response.status_code = response.status + return response + except aiohttp.ClientError as e: + print(f"Failed to get records: {e}") + return None + + +def make_request( + search_url: 'URLWithParameters' +) -> Union[aiohttp.ClientResponse, None]: + """Synchronous wrapper around the async function.""" + return asyncio.run(async_make_request(search_url)) + + +def process_response(response, url: str, page: int) -> Union[list[str], None]: """Processes the HTTP response and returns the parsed records if successful.""" + if response is None: + return None + if response.status_code == HTTPStatus.OK: - records = response.text.strip().split("\n") + records = response.text_content.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") results = [] for record in records: d = json.loads(record) results.append(d["url"]) return results - if "First Page is 0, Last Page is 0" in response.text: + + if "First Page is 0, Last Page is 0" in response.text_content: print("No records exist in index matching the url search term") return None + print(f"Unexpected response: {response.status_code}") return None + def get_common_crawl_search_results( - search_url: URLWithParameters, + search_url: 'URLWithParameters', query_url: str, page: int -) -> list[str] or None: +) -> Union[list[str], None]: response = make_request(search_url) - processed_data = process_response( - response=response, - url=query_url, - page=page - ) - # TODO: POINT OF MOCK - return processed_data + return process_response(response, query_url, page) diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py index 72ce8336..466478c7 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py @@ -1,7 +1,9 @@ import abc +import asyncio from abc import ABC import requests +import aiohttp from source_collectors.muckrock.classes.fetch_requests.FetchRequestBase import FetchRequest @@ -12,30 +14,18 @@ class MuckrockNoMoreDataError(Exception): class MuckrockServerError(Exception): pass -def fetch_muckrock_data_from_url(url: str) -> dict | None: - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError as e: - print(f"Failed to get records on request `{url}`: {e}") - # If code is 404, raise NoMoreData error - if e.response.status_code == 404: - raise MuckrockNoMoreDataError - if 500 <= e.response.status_code < 600: - raise MuckrockServerError - return None - - # TODO: POINT OF MOCK - data = response.json() - return data - class MuckrockFetcher(ABC): + 
async def get_async_request(self, url: str) -> dict | None: + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + response.raise_for_status() + return await response.json() + def fetch(self, request: FetchRequest) -> dict | None: url = self.build_url(request) - response = requests.get(url) try: - response.raise_for_status() + return asyncio.run(self.get_async_request(url)) except requests.exceptions.HTTPError as e: print(f"Failed to get records on request `{url}`: {e}") # If code is 404, raise NoMoreData error @@ -45,10 +35,6 @@ def fetch(self, request: FetchRequest) -> dict | None: raise MuckrockServerError return None - # TODO: POINT OF MOCK - data = response.json() - return data - @abc.abstractmethod def build_url(self, request: FetchRequest) -> str: pass diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py index 30024d36..7e5105d7 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py @@ -1,5 +1,7 @@ +import asyncio from abc import ABC, abstractmethod +import aiohttp import requests from source_collectors.muckrock.classes.exceptions.RequestFailureException import RequestFailureException @@ -11,15 +13,18 @@ class MuckrockIterFetcherBase(ABC): def __init__(self, initial_request: FetchRequest): self.initial_request = initial_request + async def get_response_async(self, url) -> dict: + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + response.raise_for_status() + return await response.json() + def get_response(self, url) -> dict: - # TODO: POINT OF MOCK - response = requests.get(url) try: - response.raise_for_status() + return asyncio.run(self.get_response_async(url)) except requests.exceptions.HTTPError as e: print(f"Failed to get records on request `{url}`: {e}") raise RequestFailureException - return response.json() @abstractmethod def process_results(self, results: list[dict]): diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index c962e1e7..f2b2c098 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -10,6 +10,7 @@ def test_auto_googler_collector_lifecycle(test_core): + # TODO: Rework for Async ci = test_core db_client = api.dependencies.db_client diff --git a/tests/manual/source_collectors/test_ckan_collector.py b/tests/manual/source_collectors/test_ckan_collector.py index 0fbebfa4..53fb711d 100644 --- a/tests/manual/source_collectors/test_ckan_collector.py +++ b/tests/manual/source_collectors/test_ckan_collector.py @@ -1,7 +1,9 @@ -from unittest.mock import MagicMock +from unittest.mock import MagicMock, AsyncMock +import pytest from marshmallow import Schema, fields +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient from core.CoreLogger import CoreLogger from source_collectors.ckan.CKANCollector import CKANCollector @@ -18,8 +20,8 @@ class CKANSchema(Schema): data_portal_type = fields.String() source_last_updated = fields.String() - -def test_ckan_collector_default(): +@pytest.mark.asyncio +async def test_ckan_collector_default(): collector = CKANCollector( batch_id=1, dto=CKANInputDTO( @@ -30,15 +32,20 @@ def 
test_ckan_collector_default(): } ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() schema = CKANSchema(many=True) schema.load(collector.data["results"]) + print(collector.data) def test_ckan_collector_custom(): + """ + Use this to test how CKAN behaves when using + something other than the default options provided + """ collector = CKANCollector( batch_id=1, dto=CKANInputDTO( diff --git a/tests/manual/source_collectors/test_common_crawler_collector.py b/tests/manual/source_collectors/test_common_crawler_collector.py index 9a7bc5d4..65ec778d 100644 --- a/tests/manual/source_collectors/test_common_crawler_collector.py +++ b/tests/manual/source_collectors/test_common_crawler_collector.py @@ -19,4 +19,5 @@ def test_common_crawler_collector(): db_client=MagicMock(spec=DatabaseClient) ) collector.run() + print(collector.data) CommonCrawlerSchema().load(collector.data) diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index 2065463e..c2e537b1 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -1,3 +1,4 @@ +import asyncio from dataclasses import dataclass from typing import Generator from unittest.mock import MagicMock @@ -39,6 +40,7 @@ def client(db_client_test, monkeypatch) -> Generator[TestClient, None, None]: yield c core.shutdown() + @pytest.fixture def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> APITestHelper: diff --git a/tests/test_automated/integration/api/test_batch.py b/tests/test_automated/integration/api/test_batch.py index 61c2a8b2..69c2fcab 100644 --- a/tests/test_automated/integration/api/test_batch.py +++ b/tests/test_automated/integration/api/test_batch.py @@ -1,3 +1,4 @@ +import asyncio import time from collector_db.DTOs.BatchInfo import BatchInfo diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index 2e7895d8..c31676b6 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -25,7 +25,9 @@ def test_example_collector(api_test_helper): assert batch_id is not None assert data["message"] == "Started example collector." 
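The test conversions in this part of the series all follow the same pytest-asyncio shape: mark the test async, swap MagicMock for AsyncMock on the async database client, and await the collector directly. A minimal sketch of that pattern, using a hypothetical FakeCollector rather than any of the project's real collectors:

    import pytest
    from unittest.mock import AsyncMock


    class FakeCollector:
        """Stand-in for an AsyncCollectorBase subclass; illustrative only."""

        def __init__(self, adb_client):
            self.adb_client = adb_client

        async def run(self):
            # the real collectors insert URLs through the async DB client
            await self.adb_client.insert_urls(url_infos=[], batch_id=1)


    @pytest.mark.asyncio
    async def test_fake_collector_runs():
        adb_client = AsyncMock()  # every attribute awaits cleanly
        collector = FakeCollector(adb_client)
        await collector.run()
        adb_client.insert_urls.assert_awaited_once_with(url_infos=[], batch_id=1)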
- bsr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses(status=BatchStatus.IN_PROCESS) + bsr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( + status=BatchStatus.IN_PROCESS + ) assert len(bsr.results) == 1 bsi: BatchStatusInfo = bsr.results[0] @@ -36,7 +38,10 @@ def test_example_collector(api_test_helper): time.sleep(2) - csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses(collector_type=CollectorType.EXAMPLE, status=BatchStatus.COMPLETE) + csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( + collector_type=CollectorType.EXAMPLE, + status=BatchStatus.COMPLETE + ) assert len(csr.results) == 1 bsi: BatchStatusInfo = csr.results[0] @@ -57,7 +62,6 @@ def test_example_collector(api_test_helper): lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) - assert len(lr.logs) > 0 def test_example_collector_error(api_test_helper, monkeypatch): @@ -91,6 +95,8 @@ def test_example_collector_error(api_test_helper, monkeypatch): ath.core.core_logger.flush_all() + time.sleep(10) + gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) assert gbl.logs[-1].log == "Error: Collector failed!" diff --git a/tests/test_automated/integration/conftest.py b/tests/test_automated/integration/conftest.py index 89e6b753..4377fd76 100644 --- a/tests/test_automated/integration/conftest.py +++ b/tests/test_automated/integration/conftest.py @@ -1,6 +1,11 @@ +from unittest.mock import MagicMock import pytest +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_manager.AsyncCollectorManager import AsyncCollectorManager +from collector_manager.CollectorManager import CollectorManager +from core.AsyncCore import AsyncCore from core.CoreLogger import CoreLogger from core.SourceCollectorCore import SourceCollectorCore @@ -12,9 +17,34 @@ def test_core(db_client_test): ) as logger: core = SourceCollectorCore( db_client=db_client_test, + collector_manager=CollectorManager( + db_client=db_client_test, + logger=logger + ), core_logger=logger, dev_mode=True ) yield core core.shutdown() + +@pytest.fixture +def test_async_core(db_client_test): + with CoreLogger( + db_client=db_client_test + ) as logger: + adb_client = AsyncDatabaseClient() + core = AsyncCore( + adb_client=adb_client, + huggingface_interface=MagicMock(), + url_request_interface=MagicMock(), + html_parser=MagicMock(), + discord_poster=MagicMock(), + collector_manager=AsyncCollectorManager( + adb_client=adb_client, + logger=logger, + dev_mode=True + ), + ) + yield core + core.shutdown() \ No newline at end of file diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py index 4aa51b77..3fe10580 100644 --- a/tests/test_automated/integration/core/test_async_core.py +++ b/tests/test_automated/integration/core/test_async_core.py @@ -55,7 +55,8 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): huggingface_interface=MagicMock(), url_request_interface=MagicMock(), html_parser=MagicMock(), - discord_poster=MagicMock() + discord_poster=MagicMock(), + collector_manager=MagicMock() ) await core.conclude_task(run_info=run_info) @@ -83,7 +84,8 @@ async def test_conclude_task_error(db_data_creator: DBDataCreator): huggingface_interface=MagicMock(), url_request_interface=MagicMock(), html_parser=MagicMock(), - discord_poster=MagicMock() + discord_poster=MagicMock(), + collector_manager=MagicMock() ) await 
core.conclude_task(run_info=run_info) @@ -100,7 +102,8 @@ async def test_run_task_prereq_not_met(): huggingface_interface=AsyncMock(), url_request_interface=AsyncMock(), html_parser=AsyncMock(), - discord_poster=MagicMock() + discord_poster=MagicMock(), + collector_manager=MagicMock() ) mock_operator = AsyncMock() @@ -126,7 +129,8 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: huggingface_interface=AsyncMock(), url_request_interface=AsyncMock(), html_parser=AsyncMock(), - discord_poster=MagicMock() + discord_poster=MagicMock(), + collector_manager=MagicMock() ) core.conclude_task = AsyncMock() @@ -170,7 +174,8 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: huggingface_interface=AsyncMock(), url_request_interface=AsyncMock(), html_parser=AsyncMock(), - discord_poster=MagicMock() + discord_poster=MagicMock(), + collector_manager=MagicMock() ) core.conclude_task = AsyncMock() diff --git a/tests/test_automated/integration/core/test_example_collector_lifecycle.py b/tests/test_automated/integration/core/test_example_collector_lifecycle.py index 65b9cd6c..064a93a4 100644 --- a/tests/test_automated/integration/core/test_example_collector_lifecycle.py +++ b/tests/test_automated/integration/core/test_example_collector_lifecycle.py @@ -1,25 +1,33 @@ +import asyncio import time +import pytest + from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType, URLStatus +from core.AsyncCore import AsyncCore from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.SourceCollectorCore import SourceCollectorCore from core.enums import BatchStatus - -def test_example_collector_lifecycle(test_core: SourceCollectorCore): +@pytest.mark.asyncio +async def test_example_collector_lifecycle( + test_core: SourceCollectorCore, + test_async_core: AsyncCore +): """ Test the flow of an example collector, which generates fake urls and saves them to the database """ + acore = test_async_core core = test_core db_client = core.db_client dto = ExampleInputDTO( example_field="example_value", sleep_time=1 ) - csi: CollectorStartInfo = core.initiate_collector( + csi: CollectorStartInfo = await acore.initiate_collector( collector_type=CollectorType.EXAMPLE, dto=dto, user_id=1 @@ -31,7 +39,7 @@ def test_example_collector_lifecycle(test_core: SourceCollectorCore): assert core.get_status(batch_id) == BatchStatus.IN_PROCESS print("Sleeping for 1.5 seconds...") - time.sleep(1.5) + await asyncio.sleep(1.5) print("Done sleeping...") assert core.get_status(batch_id) == BatchStatus.COMPLETE @@ -50,11 +58,16 @@ def test_example_collector_lifecycle(test_core: SourceCollectorCore): assert url_infos[0].url == "https://example.com" assert url_infos[1].url == "https://example.com/2" -def test_example_collector_lifecycle_multiple_batches(test_core: SourceCollectorCore): +@pytest.mark.asyncio +async def test_example_collector_lifecycle_multiple_batches( + test_core: SourceCollectorCore, + test_async_core: AsyncCore +): """ Test the flow of an example collector, which generates fake urls and saves them to the database """ + acore = test_async_core core = test_core csis: list[CollectorStartInfo] = [] for i in range(3): @@ -62,7 +75,7 @@ def test_example_collector_lifecycle_multiple_batches(test_core: SourceCollector example_field="example_value", sleep_time=1 ) - csi: CollectorStartInfo = core.initiate_collector( + csi: CollectorStartInfo = await acore.initiate_collector( 
collector_type=CollectorType.EXAMPLE, dto=dto, user_id=1 @@ -74,7 +87,7 @@ def test_example_collector_lifecycle_multiple_batches(test_core: SourceCollector print("Batch ID:", csi.batch_id) assert core.get_status(csi.batch_id) == BatchStatus.IN_PROCESS - time.sleep(6) + await asyncio.sleep(3) for csi in csis: assert core.get_status(csi.batch_id) == BatchStatus.COMPLETE diff --git a/tests/test_automated/integration/source_collectors/test_example_collector.py b/tests/test_automated/integration/source_collectors/test_example_collector.py index 0a6f9491..e69de29b 100644 --- a/tests/test_automated/integration/source_collectors/test_example_collector.py +++ b/tests/test_automated/integration/source_collectors/test_example_collector.py @@ -1,45 +0,0 @@ -import threading -import time - -from collector_db.DTOs.BatchInfo import BatchInfo -from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO -from collector_manager.ExampleCollector import ExampleCollector -from core.SourceCollectorCore import SourceCollectorCore -from core.enums import BatchStatus - - -def test_live_example_collector_abort(test_core: SourceCollectorCore): - core = test_core - db_client = core.db_client - - batch_id = db_client.insert_batch( - BatchInfo( - strategy="example", - status=BatchStatus.IN_PROCESS, - parameters={}, - user_id=1 - ) - ) - - - dto = ExampleInputDTO( - sleep_time=3 - ) - - collector = ExampleCollector( - batch_id=batch_id, - dto=dto, - logger=core.core_logger, - db_client=db_client, - raise_error=True - ) - # Run collector in separate thread - thread = threading.Thread(target=collector.run) - thread.start() - collector.abort() - time.sleep(2) - thread.join() - - - assert db_client.get_batch_status(batch_id) == BatchStatus.ABORTED - diff --git a/tests/test_automated/unit/collector_manager/test_collector_manager.py b/tests/test_automated/unit/collector_manager/test_collector_manager.py index 3a7b2fd9..e69de29b 100644 --- a/tests/test_automated/unit/collector_manager/test_collector_manager.py +++ b/tests/test_automated/unit/collector_manager/test_collector_manager.py @@ -1,154 +0,0 @@ -import threading -import time -from dataclasses import dataclass -from unittest.mock import Mock, MagicMock - -import pytest - -from collector_db.DatabaseClient import DatabaseClient -from collector_manager.CollectorManager import CollectorManager, InvalidCollectorError -from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO -from collector_manager.ExampleCollector import ExampleCollector -from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger - - -@dataclass -class ExampleCollectorSetup: - type = CollectorType.EXAMPLE - dto = ExampleInputDTO( - example_field="example_value", sleep_time=1 - ) - manager = CollectorManager( - logger=Mock(spec=CoreLogger), - db_client=Mock(spec=DatabaseClient) - ) - - def start_collector(self, batch_id: int): - self.manager.start_collector(self.type, batch_id, self.dto) - - -@pytest.fixture -def ecs(): - ecs = ExampleCollectorSetup() - yield ecs - ecs.manager.shutdown_all_collectors() - - - -def test_start_collector(ecs: ExampleCollectorSetup): - manager = ecs.manager - - batch_id = 1 - ecs.start_collector(batch_id) - assert batch_id in manager.collectors, "Collector not added to manager." - future = manager.futures.get(batch_id) - assert future is not None, "Thread not started for collector." - # Check that future is running - assert future.running(), "Future is not running." 
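The Future- and thread-based bookkeeping exercised by the tests deleted below maps onto asyncio.Task in the async manager (AsyncCollectorManager keeps an async_tasks dict earlier in the series). A rough sketch of that task-based replacement, with illustrative names rather than the manager's actual methods:

    import asyncio


    class TaskRegistry:
        """Illustrative task bookkeeping; not the real manager's API."""

        def __init__(self):
            self.tasks: dict[int, asyncio.Task] = {}

        def start(self, batch_id: int, coro) -> None:
            if batch_id in self.tasks:
                raise ValueError(f"Collector with batch_id {batch_id} is already running.")
            self.tasks[batch_id] = asyncio.create_task(coro)

        async def abort(self, batch_id: int) -> None:
            task = self.tasks.pop(batch_id)
            task.cancel()  # delivers CancelledError inside the running coroutine
            try:
                await task
            except asyncio.CancelledError:
                pass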
- - - print("Test passed: Collector starts correctly.") - -def test_abort_collector(ecs: ExampleCollectorSetup): - batch_id = 2 - manager = ecs.manager - - ecs.start_collector(batch_id) - - # Try getting collector initially and succeed - collector = manager.try_getting_collector(batch_id) - assert collector is not None, "Collector not found after start." - - manager.abort_collector(batch_id) - - assert batch_id not in manager.collectors, "Collector not removed after closure." - assert batch_id not in manager.threads, "Thread not removed after closure." - - # Try getting collector after closure and fail - with pytest.raises(InvalidCollectorError) as e: - manager.try_getting_collector(batch_id) - - - -def test_invalid_collector(ecs: ExampleCollectorSetup): - invalid_batch_id = 999 - - with pytest.raises(InvalidCollectorError) as e: - ecs.manager.try_getting_collector(invalid_batch_id) - - -def test_concurrent_collectors(ecs: ExampleCollectorSetup): - manager = ecs.manager - - batch_ids = [1, 2, 3] - - threads = [] - for batch_id in batch_ids: - thread = threading.Thread(target=manager.start_collector, args=(ecs.type, batch_id, ecs.dto)) - threads.append(thread) - thread.start() - - for thread in threads: - thread.join() - - assert all(batch_id in manager.collectors for batch_id in batch_ids), "Not all collectors started." - assert all(manager.futures[batch_id].running() for batch_id in batch_ids), "Not all threads are running." - - print("Test passed: Concurrent collectors managed correctly.") - -def test_thread_safety(ecs: ExampleCollectorSetup): - import concurrent.futures - - manager = ecs.manager - - def start_and_close(batch_id): - ecs.start_collector(batch_id) - time.sleep(0.1) # Simulate some processing - manager.abort_collector(batch_id) - - batch_ids = [i for i in range(1, 6)] - - with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - executor.map(start_and_close, batch_ids) - - assert not manager.collectors, "Some collectors were not cleaned up." - assert not manager.threads, "Some threads were not cleaned up." - - print("Test passed: Thread safety maintained under concurrent access.") - -def test_shutdown_all_collectors(ecs: ExampleCollectorSetup): - manager = ecs.manager - - batch_ids = [1, 2, 3] - - for batch_id in batch_ids: - ecs.start_collector(batch_id) - - manager.shutdown_all_collectors() - - assert not manager.collectors, "Not all collectors were removed." - assert not manager.threads, "Not all threads were cleaned up." 
- - print("Test passed: Shutdown cleans up all collectors and threads.") - - -def test_collector_manager_raises_exceptions(monkeypatch): - # Mock dependencies - logger = MagicMock() - db_client = MagicMock() - collector_manager = CollectorManager(logger=logger, db_client=db_client) - - dto = ExampleInputDTO(example_field="example_value", sleep_time=1) - - # Mock a collector type and DTO - batch_id = 1 - - # Patch the example collector run method to raise an exception - monkeypatch.setattr(ExampleCollector, 'run', MagicMock(side_effect=RuntimeError("Collector failed!"))) - - # Start the collector and expect an exception during shutdown - collector_manager.start_collector(CollectorType.EXAMPLE, batch_id, dto) - - with pytest.raises(RuntimeError, match="Collector failed!"): - collector_manager.shutdown_all_collectors() \ No newline at end of file diff --git a/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py b/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py index 673fcd42..050b1299 100644 --- a/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py @@ -1,7 +1,8 @@ -from unittest.mock import MagicMock +from unittest.mock import AsyncMock import pytest +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo from collector_db.DatabaseClient import DatabaseClient from core.CoreLogger import CoreLogger @@ -12,7 +13,7 @@ @pytest.fixture def patch_get_query_results(monkeypatch): patch_path = "source_collectors.auto_googler.GoogleSearcher.GoogleSearcher.get_query_results" - mock = MagicMock() + mock = AsyncMock() mock.side_effect = [ [GoogleSearchQueryResultsInnerDTO(url="https://include.com/1", title="keyword", snippet="snippet 1"),], None @@ -20,21 +21,22 @@ def patch_get_query_results(monkeypatch): monkeypatch.setattr(patch_path, mock) yield mock -def test_auto_googler_collector(patch_get_query_results): +@pytest.mark.asyncio +async def test_auto_googler_collector(patch_get_query_results): mock = patch_get_query_results collector = AutoGooglerCollector( batch_id=1, dto=AutoGooglerInputDTO( queries=["keyword"] ), - logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + logger=AsyncMock(spec=CoreLogger), + adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() mock.assert_called_once_with("keyword") - collector.db_client.insert_urls.assert_called_once_with( + collector.adb_client.insert_urls.assert_called_once_with( url_infos=[URLInfo(url="https://include.com/1", collector_metadata={"query": "keyword", "title": "keyword", "snippet": "snippet 1"})], batch_id=1 ) \ No newline at end of file diff --git a/tests/test_automated/unit/source_collectors/test_example_collector.py b/tests/test_automated/unit/source_collectors/test_example_collector.py index a0cf0c6f..17512a6f 100644 --- a/tests/test_automated/unit/source_collectors/test_example_collector.py +++ b/tests/test_automated/unit/source_collectors/test_example_collector.py @@ -13,7 +13,7 @@ def test_example_collector(): sleep_time=1 ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=MagicMock(spec=DatabaseClient), raise_error=True ) collector.run() \ No newline at end of file From cb3ed94b5413e089fbd1354512629a86b8dfabd1 Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 12 Apr 2025 13:19:20 -0400 Subject: [PATCH 095/182] DRAFT 
--- api/main.py | 7 +- api/routes/batch.py | 5 +- api/routes/collector.py | 23 ++- collector_db/DatabaseClient.py | 84 ----------- collector_manager/AsyncCollectorManager.py | 4 +- collector_manager/CollectorBase.py | 139 ------------------ collector_manager/CollectorManager.py | 103 ------------- collector_manager/constants.py | 11 +- core/AsyncCore.py | 1 - core/SourceCollectorCore.py | 50 +------ .../common_crawler/CommonCrawler.py | 16 +- .../common_crawler/CommonCrawlerCollector.py | 12 +- .../muckrock/classes/FOIASearcher.py | 12 +- .../muckrock/classes/MuckrockCollector.py | 47 +++--- .../muckrock_fetchers/AgencyFetcher.py | 4 +- .../classes/muckrock_fetchers/FOIAFetcher.py | 4 +- .../JurisdictionByIDFetcher.py | 4 +- .../muckrock_fetchers/MuckrockFetcher.py | 4 +- .../MuckrockIterFetcherBase.py | 4 +- .../muckrock_fetchers/MuckrockLoopFetcher.py | 4 +- .../muckrock_fetchers/MuckrockNextFetcher.py | 4 +- .../generate_detailed_muckrock_csv.py | 8 +- tests/conftest.py | 8 + tests/helpers/DBDataCreator.py | 27 ++-- .../test_html_tag_collector_integration.py | 6 +- .../test_autogoogler_collector.py | 13 +- .../source_collectors/test_ckan_collector.py | 11 +- .../test_common_crawler_collector.py | 12 +- .../test_muckrock_collectors.py | 37 +++-- .../integration/api/test_example_collector.py | 2 +- .../collector_db/test_db_client.py | 9 +- tests/test_automated/integration/conftest.py | 5 - .../core/helpers/common_test_procedures.py | 27 ---- .../source_collectors/test_ckan_collector.py | 20 +-- .../test_collector_closes_properly.py | 71 --------- .../test_common_crawl_collector.py | 12 +- .../test_muckrock_collectors.py | 41 +++--- 37 files changed, 202 insertions(+), 649 deletions(-) delete mode 100644 collector_manager/CollectorBase.py delete mode 100644 tests/test_automated/integration/core/helpers/common_test_procedures.py delete mode 100644 tests/test_automated/unit/source_collectors/test_collector_closes_properly.py diff --git a/api/main.py b/api/main.py index a38ead34..37521822 100644 --- a/api/main.py +++ b/api/main.py @@ -14,7 +14,6 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient from collector_manager.AsyncCollectorManager import AsyncCollectorManager -from collector_manager.CollectorManager import CollectorManager from core.AsyncCore import AsyncCore from core.CoreLogger import CoreLogger from core.ScheduledTaskManager import AsyncScheduledTaskManager @@ -34,10 +33,6 @@ async def lifespan(app: FastAPI): adb_client = AsyncDatabaseClient() await setup_database(db_client) core_logger = CoreLogger(db_client=db_client) - collector_manager = CollectorManager( - logger=core_logger, - db_client=db_client, - ) async_collector_manager = AsyncCollectorManager( logger=core_logger, adb_client=adb_client, @@ -47,7 +42,6 @@ async def lifespan(app: FastAPI): db_client=db_client ), db_client=DatabaseClient(), - collector_manager=collector_manager ) async_core = AsyncCore( adb_client=adb_client, @@ -72,6 +66,7 @@ async def lifespan(app: FastAPI): yield # Code here runs before shutdown # Shutdown logic (if needed) + core_logger.shutdown() app.state.core.shutdown() # Clean up resources, close connections, etc. 
pass diff --git a/api/routes/batch.py b/api/routes/batch.py index 950b6931..23df2394 100644 --- a/api/routes/batch.py +++ b/api/routes/batch.py @@ -99,7 +99,4 @@ async def abort_batch( async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> MessageResponse: - try: - return core.abort_batch(batch_id) - except InvalidCollectorError as e: - return await async_core.abort_batch(batch_id) \ No newline at end of file + return await async_core.abort_batch(batch_id) \ No newline at end of file diff --git a/api/routes/collector.py b/api/routes/collector.py index 18c488b8..e2789443 100644 --- a/api/routes/collector.py +++ b/api/routes/collector.py @@ -1,12 +1,11 @@ from fastapi import APIRouter from fastapi.params import Depends -from api.dependencies import get_core, get_async_core +from api.dependencies import get_async_core from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType from core.AsyncCore import AsyncCore from core.DTOs.CollectorStartInfo import CollectorStartInfo -from core.SourceCollectorCore import SourceCollectorCore from security_manager.SecurityManager import AccessInfo, get_access_info from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO from source_collectors.ckan.DTOs import CKANInputDTO @@ -38,13 +37,13 @@ async def start_example_collector( @collector_router.post("/ckan") async def start_ckan_collector( dto: CKANInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the ckan collector """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.CKAN, dto=dto, user_id=access_info.user_id @@ -53,13 +52,13 @@ async def start_ckan_collector( @collector_router.post("/common-crawler") async def start_common_crawler_collector( dto: CommonCrawlerInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the common crawler collector """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.COMMON_CRAWLER, dto=dto, user_id=access_info.user_id @@ -83,13 +82,13 @@ async def start_auto_googler_collector( @collector_router.post("/muckrock-simple") async def start_muckrock_collector( dto: MuckrockSimpleSearchCollectorInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the muckrock collector """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.MUCKROCK_SIMPLE_SEARCH, dto=dto, user_id=access_info.user_id @@ -98,13 +97,13 @@ async def start_muckrock_collector( @collector_router.post("/muckrock-county") async def start_muckrock_county_collector( dto: MuckrockCountySearchCollectorInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the muckrock county level collector """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.MUCKROCK_COUNTY_SEARCH, dto=dto, user_id=access_info.user_id @@ -113,13 +112,13 @@ async def 
start_muckrock_county_collector( @collector_router.post("/muckrock-all") async def start_muckrock_all_foia_collector( dto: MuckrockAllFOIARequestsCollectorInputDTO, - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> CollectorStartInfo: """ Start the muckrock collector for all FOIA requests """ - return core.initiate_collector( + return await core.initiate_collector( collector_type=CollectorType.MUCKROCK_ALL_SEARCH, dto=dto, user_id=access_info.user_id diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index 372cca8e..06107651 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -3,25 +3,19 @@ from typing import Optional, List from sqlalchemy import create_engine, Row -from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, aliased from collector_db.ConfigManager import ConfigManager from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DTOs.DuplicateInfo import DuplicateInfo, DuplicateInsertInfo -from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.LogInfo import LogInfo, LogOutputInfo from collector_db.DTOs.URLInfo import URLInfo -from collector_db.DTOs.URLMapping import URLMapping from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import Base, Batch, URL, Log, Duplicate from collector_manager.enums import CollectorType from core.enums import BatchStatus -# SQLAlchemy ORM models - - # Database Client class DatabaseClient: def __init__(self, db_url: str = get_postgres_connection_string()): @@ -79,54 +73,12 @@ def insert_batch(self, session, batch_info: BatchInfo) -> int: session.refresh(batch) return batch.id - @session_manager - def update_batch_post_collection( - self, - session, - batch_id: int, - total_url_count: int, - original_url_count: int, - duplicate_url_count: int, - batch_status: BatchStatus, - compute_time: float = None, - ): - batch = session.query(Batch).filter_by(id=batch_id).first() - batch.total_url_count = total_url_count - batch.original_url_count = original_url_count - batch.duplicate_url_count = duplicate_url_count - batch.status = batch_status.value - batch.compute_time = compute_time - @session_manager def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchInfo]: """Retrieve a batch by ID.""" batch = session.query(Batch).filter_by(id=batch_id).first() return BatchInfo(**batch.__dict__) - def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo: - url_mappings = [] - duplicates = [] - for url_info in url_infos: - url_info.batch_id = batch_id - try: - url_id = self.insert_url(url_info) - url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) - except IntegrityError: - orig_url_info = self.get_url_info_by_url(url_info.url) - duplicate_info = DuplicateInsertInfo( - duplicate_batch_id=batch_id, - original_url_id=orig_url_info.id - ) - duplicates.append(duplicate_info) - self.insert_duplicates(duplicates) - - return InsertURLsInfo( - url_mappings=url_mappings, - total_count=len(url_infos), - original_count=len(url_mappings), - duplicate_count=len(duplicates), - url_ids=[url_mapping.url_id for url_mapping in url_mappings] - ) @session_manager def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo]): @@ -138,27 +90,6 @@ def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo]) 
session.add(duplicate) - - @session_manager - def get_url_info_by_url(self, session, url: str) -> Optional[URLInfo]: - url = session.query(URL).filter_by(url=url).first() - return URLInfo(**url.__dict__) - - @session_manager - def insert_url(self, session, url_info: URLInfo) -> int: - """Insert a new URL into the database.""" - url_entry = URL( - batch_id=url_info.batch_id, - url=url_info.url, - collector_metadata=url_info.collector_metadata, - outcome=url_info.outcome.value - ) - session.add(url_entry) - session.commit() - session.refresh(url_entry) - return url_entry.id - - @session_manager def get_urls_by_batch(self, session, batch_id: int, page: int = 1) -> List[URLInfo]: """Retrieve all URLs associated with a batch.""" @@ -166,11 +97,6 @@ def get_urls_by_batch(self, session, batch_id: int, page: int = 1) -> List[URLIn .order_by(URL.id).limit(100).offset((page - 1) * 100).all()) return ([URLInfo(**url.__dict__) for url in urls]) - @session_manager - def is_duplicate_url(self, session, url: str) -> bool: - result = session.query(URL).filter_by(url=url).first() - return result is not None - @session_manager def insert_logs(self, session, log_infos: List[LogInfo]): for log_info in log_infos: @@ -189,16 +115,6 @@ def get_all_logs(self, session) -> List[LogInfo]: logs = session.query(Log).all() return ([LogInfo(**log.__dict__) for log in logs]) - @session_manager - def add_duplicate_info(self, session, duplicate_infos: list[DuplicateInfo]): - # TODO: Add test for this method when testing CollectorDatabaseProcessor - for duplicate_info in duplicate_infos: - duplicate = Duplicate( - batch_id=duplicate_info.original_batch_id, - original_url_id=duplicate_info.original_url_id, - ) - session.add(duplicate) - @session_manager def get_batch_status(self, session, batch_id: int) -> BatchStatus: batch = session.query(Batch).filter_by(id=batch_id).first() diff --git a/collector_manager/AsyncCollectorManager.py b/collector_manager/AsyncCollectorManager.py index ecce57b6..af875ddc 100644 --- a/collector_manager/AsyncCollectorManager.py +++ b/collector_manager/AsyncCollectorManager.py @@ -6,7 +6,7 @@ from pydantic import BaseModel from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_manager.CollectorBase import CollectorBase +from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.CollectorManager import InvalidCollectorError from collector_manager.collector_mapping import COLLECTOR_MAPPING from collector_manager.enums import CollectorType @@ -21,7 +21,7 @@ def __init__( adb_client: AsyncDatabaseClient, dev_mode: bool = False ): - self.collectors: Dict[int, CollectorBase] = {} + self.collectors: Dict[int, AsyncCollectorBase] = {} self.adb_client = adb_client self.logger = logger self.async_tasks: dict[int, asyncio.Task] = {} diff --git a/collector_manager/CollectorBase.py b/collector_manager/CollectorBase.py deleted file mode 100644 index 4fcb8f58..00000000 --- a/collector_manager/CollectorBase.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Base class for all collectors -""" -import abc -import threading -import time -from abc import ABC -from typing import Optional, Type - -from pydantic import BaseModel - -from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo -from collector_db.DTOs.LogInfo import LogInfo -from collector_db.DatabaseClient import DatabaseClient -from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger -from core.enums import BatchStatus -from core.preprocessors.PreprocessorBase import 
PreprocessorBase - - -class CollectorAbortException(Exception): - pass - -class CollectorBase(ABC): - collector_type: CollectorType = None - preprocessor: Type[PreprocessorBase] = None - - def __init__( - self, - batch_id: int, - dto: BaseModel, - logger: CoreLogger, - db_client: DatabaseClient, - raise_error: bool = False, - ) -> None: - self.batch_id = batch_id - self.db_client = db_client - self.dto = dto - self.data: Optional[BaseModel] = None - self.logger = logger - self.status = BatchStatus.IN_PROCESS - self.start_time = None - self.compute_time = None - self.raise_error = raise_error - # # TODO: Determine how to update this in some of the other collectors - self._stop_event = threading.Event() - - @abc.abstractmethod - def run_implementation(self) -> None: - """ - This is the method that will be overridden by each collector - No other methods should be modified except for this one. - However, in each inherited class, new methods in addition to this one can be created - Returns: - - """ - raise NotImplementedError - - def start_timer(self) -> None: - self.start_time = time.time() - - def stop_timer(self) -> None: - self.compute_time = time.time() - self.start_time - - def handle_error(self, e: Exception) -> None: - if self.raise_error: - raise e - self.log(f"Error: {e}") - self.db_client.update_batch_post_collection( - batch_id=self.batch_id, - batch_status=self.status, - compute_time=self.compute_time, - total_url_count=0, - original_url_count=0, - duplicate_url_count=0 - ) - - def process(self) -> None: - self.log("Processing collector...", allow_abort=False) - preprocessor = self.preprocessor() - url_infos = preprocessor.preprocess(self.data) - self.log(f"URLs processed: {len(url_infos)}", allow_abort=False) - - self.log("Inserting URLs...", allow_abort=False) - insert_urls_info: InsertURLsInfo = self.db_client.insert_urls( - url_infos=url_infos, - batch_id=self.batch_id - ) - self.log("Updating batch...", allow_abort=False) - self.db_client.update_batch_post_collection( - batch_id=self.batch_id, - total_url_count=insert_urls_info.total_count, - duplicate_url_count=insert_urls_info.duplicate_count, - original_url_count=insert_urls_info.original_count, - batch_status=self.status, - compute_time=self.compute_time - ) - self.log("Done processing collector.", allow_abort=False) - - - def run(self) -> None: - try: - self.start_timer() - self.run_implementation() - self.stop_timer() - self.log("Collector completed successfully.") - self.close() - self.process() - except CollectorAbortException: - self.stop_timer() - self.status = BatchStatus.ABORTED - self.db_client.update_batch_post_collection( - batch_id=self.batch_id, - batch_status=BatchStatus.ABORTED, - compute_time=self.compute_time, - total_url_count=0, - original_url_count=0, - duplicate_url_count=0 - ) - except Exception as e: - self.stop_timer() - self.status = BatchStatus.ERROR - self.handle_error(e) - - def log(self, message: str, allow_abort = True) -> None: - if self._stop_event.is_set() and allow_abort: - raise CollectorAbortException - self.logger.log(LogInfo( - batch_id=self.batch_id, - log=message - )) - - def abort(self) -> None: - self._stop_event.set() # Signal the thread to stop - self.log("Collector was aborted.", allow_abort=False) - - def close(self) -> None: - self._stop_event.set() - self.status = BatchStatus.COMPLETE diff --git a/collector_manager/CollectorManager.py b/collector_manager/CollectorManager.py index e37b47c5..9fd5a428 100644 --- a/collector_manager/CollectorManager.py +++ 
b/collector_manager/CollectorManager.py @@ -3,109 +3,6 @@ Can start, stop, and get info on running collectors And manages the retrieval of collector info """ -import asyncio -import threading -from concurrent.futures import Future, ThreadPoolExecutor -from http import HTTPStatus -from typing import Dict, List - -from fastapi import HTTPException -from pydantic import BaseModel - -from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.DatabaseClient import DatabaseClient -from collector_manager.CollectorBase import CollectorBase -from collector_manager.collector_mapping import COLLECTOR_MAPPING -from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger - class InvalidCollectorError(Exception): pass - -# Collector Manager Class -class CollectorManager: - def __init__( - self, - logger: CoreLogger, - db_client: DatabaseClient, - dev_mode: bool = False, - max_workers: int = 10 # Limit the number of concurrent threads - ): - self.collectors: Dict[int, CollectorBase] = {} - self.futures: Dict[int, Future] = {} - self.threads: Dict[int, threading.Thread] = {} - self.db_client = db_client - self.logger = logger - self.lock = threading.Lock() - self.max_workers = max_workers - self.dev_mode = dev_mode - self.executor = ThreadPoolExecutor(max_workers=self.max_workers) - - async def has_collector(self, cid: int) -> bool: - return cid in self.collectors - - - def restart_executor(self): - self.executor = ThreadPoolExecutor(max_workers=self.max_workers) - - def start_collector( - self, - collector_type: CollectorType, - batch_id: int, - dto: BaseModel - ) -> None: - with self.lock: - # If executor is shutdown, restart it - if self.executor._shutdown: - self.restart_executor() - - if batch_id in self.collectors: - raise ValueError(f"Collector with batch_id {batch_id} is already running.") - try: - collector_class = COLLECTOR_MAPPING[collector_type] - collector = collector_class( - batch_id=batch_id, - dto=dto, - logger=self.logger, - db_client=self.db_client, - raise_error=True if self.dev_mode else False - ) - except KeyError: - raise InvalidCollectorError(f"Collector {collector_type.value} not found.") - self.collectors[batch_id] = collector - - future = self.executor.submit(collector.run) - self.futures[batch_id] = future - - def try_getting_collector(self, cid): - collector = self.collectors.get(cid) - if collector is None: - raise InvalidCollectorError(f"Collector with CID {cid} not found.") - return collector - - def abort_collector(self, cid: int) -> None: - collector = self.try_getting_collector(cid) - - # Get collector thread - thread = self.threads.get(cid) - future = self.futures.get(cid) - collector.abort() - # thread.join(timeout=1) - self.collectors.pop(cid) - self.futures.pop(cid) - # self.threads.pop(cid) - - def shutdown_all_collectors(self) -> None: - with self.lock: - for cid, future in self.futures.items(): - if future.done(): - try: - future.result() - except Exception as e: - raise e - self.collectors[cid].abort() - - self.executor.shutdown(wait=True) - self.collectors.clear() - self.futures.clear() \ No newline at end of file diff --git a/collector_manager/constants.py b/collector_manager/constants.py index 444fad06..fde231d9 100644 --- a/collector_manager/constants.py +++ b/collector_manager/constants.py @@ -2,13 +2,10 @@ ASYNC_COLLECTORS = [ CollectorType.AUTO_GOOGLER, - CollectorType.EXAMPLE -] - -SYNC_COLLECTORS = [ + CollectorType.EXAMPLE, + CollectorType.CKAN, + CollectorType.COMMON_CRAWLER, 
CollectorType.MUCKROCK_SIMPLE_SEARCH, CollectorType.MUCKROCK_COUNTY_SEARCH, CollectorType.MUCKROCK_ALL_SEARCH, - CollectorType.CKAN, - CollectorType.COMMON_CRAWLER, -] \ No newline at end of file +] diff --git a/core/AsyncCore.py b/core/AsyncCore.py index c7626111..0b24e061 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -11,7 +11,6 @@ from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType from collector_manager.AsyncCollectorManager import AsyncCollectorManager -from collector_manager.CollectorManager import CollectorManager from collector_manager.constants import ASYNC_COLLECTORS from collector_manager.enums import CollectorType from core.DTOs.CollectorStartInfo import CollectorStartInfo diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index 585bcb52..a0bb34fc 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -1,18 +1,12 @@ -from typing import Optional +from typing import Optional, Any -from pydantic import BaseModel -from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DatabaseClient import DatabaseClient -from collector_manager.CollectorManager import CollectorManager from collector_manager.enums import CollectorType from core.CoreLogger import CoreLogger -from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse -from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse -from core.DTOs.MessageResponse import MessageResponse from core.ScheduledTaskManager import ScheduledTaskManager from core.enums import BatchStatus @@ -21,13 +15,12 @@ class SourceCollectorCore: def __init__( self, core_logger: CoreLogger, - collector_manager: CollectorManager, + collector_manager: Optional[Any] = None, db_client: DatabaseClient = DatabaseClient(), dev_mode: bool = False ): self.db_client = db_client self.core_logger = core_logger - self.collector_manager = collector_manager if not dev_mode: self.scheduled_task_manager = ScheduledTaskManager(db_client=db_client) else: @@ -53,50 +46,11 @@ def get_batch_statuses( def get_status(self, batch_id: int) -> BatchStatus: return self.db_client.get_batch_status(batch_id) - def initiate_collector( - self, - collector_type: CollectorType, - user_id: int, - dto: Optional[BaseModel] = None, - ): - """ - Reserves a batch ID from the database - and starts the requisite collector - """ - batch_info = BatchInfo( - strategy=collector_type.value, - status=BatchStatus.IN_PROCESS, - parameters=dto.model_dump(), - user_id=user_id - ) - batch_id = self.db_client.insert_batch(batch_info) - self.collector_manager.start_collector( - collector_type=collector_type, - batch_id=batch_id, - dto=dto - ) - return CollectorStartInfo( - batch_id=batch_id, - message=f"Started {collector_type.value} collector." 
-        )

     def get_batch_logs(self, batch_id: int) -> GetBatchLogsResponse:
         logs = self.db_client.get_logs_by_batch_id(batch_id)
         return GetBatchLogsResponse(logs=logs)

-    def abort_batch(self, batch_id: int) -> MessageResponse:
-        self.collector_manager.abort_collector(cid=batch_id)
-        return MessageResponse(message=f"Batch aborted.")
-
-    def restart(self):
-        self.collector_manager.shutdown_all_collectors()
-        self.collector_manager.restart_executor()
-        self.collector_manager.logger.restart()
-
     def shutdown(self):
-        self.collector_manager.shutdown_all_collectors()
-        self.collector_manager.logger.shutdown()
         if self.scheduled_task_manager is not None:
             self.scheduled_task_manager.shutdown()
diff --git a/source_collectors/common_crawler/CommonCrawler.py b/source_collectors/common_crawler/CommonCrawler.py
index 2bd2143c..db683611 100644
--- a/source_collectors/common_crawler/CommonCrawler.py
+++ b/source_collectors/common_crawler/CommonCrawler.py
@@ -35,11 +35,11 @@ async def async_make_request(
     return None


-def make_request(
+async def make_request(
     search_url: 'URLWithParameters'
 ) -> Union[aiohttp.ClientResponse, None]:
-    """Synchronous wrapper around the async function."""
-    return asyncio.run(async_make_request(search_url))
+    """Async wrapper around async_make_request."""
+    return await async_make_request(search_url)


 def process_response(response, url: str, page: int) -> Union[list[str], None]:
@@ -64,12 +64,12 @@ def process_response(response, url: str, page: int) -> Union[list[str], None]:
     return None


-def get_common_crawl_search_results(
+async def get_common_crawl_search_results(
     search_url: 'URLWithParameters',
     query_url: str,
     page: int
 ) -> Union[list[str], None]:
-    response = make_request(search_url)
+    response = await make_request(search_url)
     return process_response(response, query_url, page)


@@ -100,10 +100,10 @@ def __init__(
         self.num_pages = num_pages
         self.url_results = None

-    def run(self):
+    async def run(self):
         url_results = []
         for page in range(self.start_page, self.start_page + self.num_pages):
-            urls = self.search_common_crawl_index(query_url=self.url, page=page)
+            urls = await self.search_common_crawl_index(query_url=self.url, page=page)

             # If records were found, filter them and add to results
             if not urls:
@@ -121,7 +121,7 @@ def run(self):

         self.url_results = url_results

-    def search_common_crawl_index(
+    async def search_common_crawl_index(
         self, query_url: str, page: int = 0, max_retries: int = 20
     ) -> list[str] or None:
         """
@@ -144,7 +144,7 @@ def search_common_crawl_index(
         # put HTTP GET request in re-try loop in case of rate limiting. Once per second is nice enough per common crawl doc.
while retries < max_retries: - results = get_common_crawl_search_results( + results = await get_common_crawl_search_results( search_url=search_url, query_url=query_url, page=page) if results is not None: return results diff --git a/source_collectors/common_crawler/CommonCrawlerCollector.py b/source_collectors/common_crawler/CommonCrawlerCollector.py index 71365680..eb28d545 100644 --- a/source_collectors/common_crawler/CommonCrawlerCollector.py +++ b/source_collectors/common_crawler/CommonCrawlerCollector.py @@ -1,15 +1,15 @@ -from collector_manager.CollectorBase import CollectorBase +from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.enums import CollectorType from core.preprocessors.CommonCrawlerPreprocessor import CommonCrawlerPreprocessor from source_collectors.common_crawler.CommonCrawler import CommonCrawler from source_collectors.common_crawler.DTOs import CommonCrawlerInputDTO -class CommonCrawlerCollector(CollectorBase): +class CommonCrawlerCollector(AsyncCollectorBase): collector_type = CollectorType.COMMON_CRAWLER preprocessor = CommonCrawlerPreprocessor - def run_implementation(self) -> None: + async def run_implementation(self) -> None: print("Running Common Crawler...") dto: CommonCrawlerInputDTO = self.dto common_crawler = CommonCrawler( @@ -17,9 +17,9 @@ def run_implementation(self) -> None: url=dto.url, keyword=dto.search_term, start_page=dto.start_page, - num_pages=dto.total_pages + num_pages=dto.total_pages, ) - for status in common_crawler.run(): - self.log(status) + async for status in common_crawler.run(): + await self.log(status) self.data = {"urls": common_crawler.url_results} \ No newline at end of file diff --git a/source_collectors/muckrock/classes/FOIASearcher.py b/source_collectors/muckrock/classes/FOIASearcher.py index b4d3abaa..cb3af7e8 100644 --- a/source_collectors/muckrock/classes/FOIASearcher.py +++ b/source_collectors/muckrock/classes/FOIASearcher.py @@ -17,11 +17,11 @@ def __init__(self, fetcher: FOIAFetcher, search_term: Optional[str] = None): self.fetcher = fetcher self.search_term = search_term - def fetch_page(self) -> list[dict] | None: + async def fetch_page(self) -> list[dict] | None: """ Fetches the next page of results using the fetcher. """ - data = self.fetcher.fetch_next_page() + data = await self.fetcher.fetch_next_page() if data is None or data.get("results") is None: return None return data.get("results") @@ -43,7 +43,7 @@ def update_progress(self, pbar: tqdm, results: list[dict]) -> int: pbar.update(num_results) return num_results - def search_to_count(self, max_count: int) -> list[dict]: + async def search_to_count(self, max_count: int) -> list[dict]: """ Fetches and processes results up to a maximum count. """ @@ -52,7 +52,7 @@ def search_to_count(self, max_count: int) -> list[dict]: with tqdm(total=max_count, desc="Fetching results", unit="result") as pbar: while count > 0: try: - results = self.get_next_page_results() + results = await self.get_next_page_results() except SearchCompleteException: break @@ -61,11 +61,11 @@ def search_to_count(self, max_count: int) -> list[dict]: return all_results - def get_next_page_results(self) -> list[dict]: + async def get_next_page_results(self) -> list[dict]: """ Fetches and processes the next page of results. 
""" - results = self.fetch_page() + results = await self.fetch_page() if not results: raise SearchCompleteException return self.filter_results(results) diff --git a/source_collectors/muckrock/classes/MuckrockCollector.py b/source_collectors/muckrock/classes/MuckrockCollector.py index 8924b116..885c0369 100644 --- a/source_collectors/muckrock/classes/MuckrockCollector.py +++ b/source_collectors/muckrock/classes/MuckrockCollector.py @@ -1,6 +1,6 @@ import itertools -from collector_manager.CollectorBase import CollectorBase +from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.enums import CollectorType from core.preprocessors.MuckrockPreprocessor import MuckrockPreprocessor from source_collectors.muckrock.DTOs import MuckrockAllFOIARequestsCollectorInputDTO, \ @@ -15,7 +15,7 @@ from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError -class MuckrockSimpleSearchCollector(CollectorBase): +class MuckrockSimpleSearchCollector(AsyncCollectorBase): """ Performs searches on MuckRock's database by matching a search string to title of request @@ -29,7 +29,7 @@ def check_for_count_break(self, count, max_count) -> None: if count >= max_count: raise SearchCompleteException - def run_implementation(self) -> None: + async def run_implementation(self) -> None: fetcher = FOIAFetcher() dto: MuckrockSimpleSearchCollectorInputDTO = self.dto searcher = FOIASearcher( @@ -41,7 +41,7 @@ def run_implementation(self) -> None: results_count = 0 for search_count in itertools.count(): try: - results = searcher.get_next_page_results() + results = await searcher.get_next_page_results() all_results.extend(results) results_count += len(results) self.check_for_count_break(results_count, max_count) @@ -64,19 +64,19 @@ def format_results(self, results: list[dict]) -> list[dict]: return formatted_results -class MuckrockCountyLevelSearchCollector(CollectorBase): +class MuckrockCountyLevelSearchCollector(AsyncCollectorBase): """ Searches for any and all requests in a certain county """ collector_type = CollectorType.MUCKROCK_COUNTY_SEARCH preprocessor = MuckrockPreprocessor - def run_implementation(self) -> None: - jurisdiction_ids = self.get_jurisdiction_ids() + async def run_implementation(self) -> None: + jurisdiction_ids = await self.get_jurisdiction_ids() if jurisdiction_ids is None: - self.log("No jurisdictions found") + await self.log("No jurisdictions found") return - all_data = self.get_foia_records(jurisdiction_ids) + all_data = await self.get_foia_records(jurisdiction_ids) formatted_data = self.format_data(all_data) self.data = {"urls": formatted_data} @@ -89,19 +89,17 @@ def format_data(self, all_data): }) return formatted_data - def get_foia_records(self, jurisdiction_ids): - # TODO: Mock results here and test separately + async def get_foia_records(self, jurisdiction_ids): all_data = [] for name, id_ in jurisdiction_ids.items(): - self.log(f"Fetching records for {name}...") + await self.log(f"Fetching records for {name}...") request = FOIALoopFetchRequest(jurisdiction=id_) fetcher = FOIALoopFetcher(request) - fetcher.loop_fetch() + await fetcher.loop_fetch() all_data.extend(fetcher.ffm.results) return all_data - def get_jurisdiction_ids(self): - # TODO: Mock results here and test separately + async def get_jurisdiction_ids(self): dto: MuckrockCountySearchCollectorInputDTO = self.dto parent_jurisdiction_id = dto.parent_jurisdiction_id request = JurisdictionLoopFetchRequest( @@ -110,40 +108,39 @@ def get_jurisdiction_ids(self): 
town_names=dto.town_names ) fetcher = JurisdictionGeneratorFetcher(initial_request=request) - for message in fetcher.generator_fetch(): - self.log(message) + async for message in fetcher.generator_fetch(): + await self.log(message) jurisdiction_ids = fetcher.jfm.jurisdictions return jurisdiction_ids -class MuckrockAllFOIARequestsCollector(CollectorBase): +class MuckrockAllFOIARequestsCollector(AsyncCollectorBase): """ Retrieves urls associated with all Muckrock FOIA requests """ collector_type = CollectorType.MUCKROCK_ALL_SEARCH preprocessor = MuckrockPreprocessor - def run_implementation(self) -> None: + async def run_implementation(self) -> None: dto: MuckrockAllFOIARequestsCollectorInputDTO = self.dto start_page = dto.start_page fetcher = FOIAFetcher( start_page=start_page, ) total_pages = dto.total_pages - all_page_data = self.get_page_data(fetcher, start_page, total_pages) + all_page_data = await self.get_page_data(fetcher, start_page, total_pages) all_transformed_data = self.transform_data(all_page_data) self.data = {"urls": all_transformed_data} - def get_page_data(self, fetcher, start_page, total_pages): - # TODO: Mock results here and test separately + async def get_page_data(self, fetcher, start_page, total_pages): all_page_data = [] for page in range(start_page, start_page + total_pages): - self.log(f"Fetching page {fetcher.current_page}") + await self.log(f"Fetching page {fetcher.current_page}") try: - page_data = fetcher.fetch_next_page() + page_data = await fetcher.fetch_next_page() except MuckrockNoMoreDataError: - self.log(f"No more data to fetch at page {fetcher.current_page}") + await self.log(f"No more data to fetch at page {fetcher.current_page}") break if page_data is None: continue diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py index d3e7364a..e73180df 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py @@ -11,5 +11,5 @@ class AgencyFetcher(MuckrockFetcher): def build_url(self, request: AgencyFetchRequest) -> str: return f"{BASE_MUCKROCK_URL}/agency/{request.agency_id}/" - def get_agency(self, agency_id: int): - return self.fetch(AgencyFetchRequest(agency_id=agency_id)) \ No newline at end of file + async def get_agency(self, agency_id: int): + return await self.fetch(AgencyFetchRequest(agency_id=agency_id)) \ No newline at end of file diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py index 526698b7..3a057864 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py @@ -30,12 +30,12 @@ def __init__(self, start_page: int = 1, per_page: int = 100): def build_url(self, request: FOIAFetchRequest) -> str: return f"{FOIA_BASE_URL}?page={request.page}&page_size={request.page_size}&format=json" - def fetch_next_page(self) -> dict | None: + async def fetch_next_page(self) -> dict | None: """ Fetches data from a specific page of the MuckRock FOIA API. 
""" page = self.current_page self.current_page += 1 request = FOIAFetchRequest(page=page, page_size=self.per_page) - return self.fetch(request) + return await self.fetch(request) diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py index c8c467a1..08db97dd 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py @@ -11,5 +11,5 @@ class JurisdictionByIDFetcher(MuckrockFetcher): def build_url(self, request: JurisdictionByIDFetchRequest) -> str: return f"{BASE_MUCKROCK_URL}/jurisdiction/{request.jurisdiction_id}/" - def get_jurisdiction(self, jurisdiction_id: int) -> dict: - return self.fetch(request=JurisdictionByIDFetchRequest(jurisdiction_id=jurisdiction_id)) + async def get_jurisdiction(self, jurisdiction_id: int) -> dict: + return await self.fetch(request=JurisdictionByIDFetchRequest(jurisdiction_id=jurisdiction_id)) diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py index 466478c7..c1a6eecb 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py @@ -22,10 +22,10 @@ async def get_async_request(self, url: str) -> dict | None: response.raise_for_status() return await response.json() - def fetch(self, request: FetchRequest) -> dict | None: + async def fetch(self, request: FetchRequest) -> dict | None: url = self.build_url(request) try: - return asyncio.run(self.get_async_request(url)) + return await self.get_async_request(url) except requests.exceptions.HTTPError as e: print(f"Failed to get records on request `{url}`: {e}") # If code is 404, raise NoMoreData error diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py index 7e5105d7..67253034 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockIterFetcherBase.py @@ -19,9 +19,9 @@ async def get_response_async(self, url) -> dict: response.raise_for_status() return await response.json() - def get_response(self, url) -> dict: + async def get_response(self, url) -> dict: try: - return asyncio.run(self.get_response_async(url)) + return await self.get_response_async(url) except requests.exceptions.HTTPError as e: print(f"Failed to get records on request `{url}`: {e}") raise RequestFailureException diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py index 3558b7cd..2e4814a5 100644 --- a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py @@ -7,11 +7,11 @@ class MuckrockLoopFetcher(MuckrockIterFetcherBase): - def loop_fetch(self): + async def loop_fetch(self): url = self.build_url(self.initial_request) while url is not None: try: - data = self.get_response(url) + data = await self.get_response(url) except RequestFailureException: break diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockNextFetcher.py 
b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockNextFetcher.py
index 7c5fd359..889e8446 100644
--- a/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockNextFetcher.py
+++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockNextFetcher.py
@@ -8,7 +8,7 @@ class MuckrockGeneratorFetcher(MuckrockIterFetcherBase):
     as a generator instead of a loop
     """

-    def generator_fetch(self) -> str:
+    async def generator_fetch(self) -> str:
         """
         Fetches data and yields status messages between requests
         """
@@ -16,7 +16,7 @@ def generator_fetch(self) -> str:
         final_message = "No more records found. Exiting..."
         while url is not None:
             try:
-                data = self.get_response(url)
+                data = await self.get_response(url)
             except RequestFailureException:
                 final_message = "Request unexpectedly failed. Exiting..."
                 break
diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py
index 3cb884c0..94e0034f 100644
--- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py
+++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py
@@ -67,22 +67,22 @@ def keys(self) -> list[str]:
         return list(self.model_dump().keys())


-def main():
+async def main():
     json_filename = get_json_filename()
     json_data = load_json_file(json_filename)
     output_csv = format_filename_json_to_csv(json_filename)
-    agency_infos = get_agency_infos(json_data)
+    agency_infos = await get_agency_infos(json_data)
     write_to_csv(agency_infos, output_csv)


-def get_agency_infos(json_data):
+async def get_agency_infos(json_data):
     a_fetcher = AgencyFetcher()
     j_fetcher = JurisdictionByIDFetcher()
     agency_infos = []
     # Iterate through the JSON data
     for item in json_data:
         print(f"Writing data for {item.get('title')}")
-        agency_data = a_fetcher.get_agency(agency_id=item.get("agency"))
+        agency_data = await a_fetcher.get_agency(agency_id=item.get("agency"))
         time.sleep(1)
-        jurisdiction_data = j_fetcher.get_jurisdiction(
+        jurisdiction_data = await j_fetcher.get_jurisdiction(
             jurisdiction_id=agency_data.get("jurisdiction")
diff --git a/tests/conftest.py b/tests/conftest.py
index 7cc4291c..fbe5dd50 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@
 from sqlalchemy import create_engine, inspect, MetaData
 from sqlalchemy.orm import scoped_session, sessionmaker

+from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.DatabaseClient import DatabaseClient
 from collector_db.helper_functions import get_postgres_connection_string
 from collector_db.models import Base
@@ -63,6 +64,13 @@ def db_client_test(wipe_database) -> DatabaseClient:
     yield db_client
     db_client.engine.dispose()

+@pytest.fixture
+def adb_client_test(wipe_database) -> AsyncDatabaseClient:
+    conn = get_postgres_connection_string()
+    adb_client = AsyncDatabaseClient(db_url=conn)
+    yield adb_client
+    adb_client.engine.dispose()
+
 @pytest.fixture
 def db_data_creator(db_client_test):
     db_data_creator = DBDataCreator(db_client=db_client_test)
diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py
index 9f9719a7..6964fb86 100644
--- a/tests/helpers/DBDataCreator.py
+++ b/tests/helpers/DBDataCreator.py
@@ -1,3 +1,4 @@
+import asyncio
 from random import randint
 from typing import List, Optional

@@ -10,9 +11,8 @@
 from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
 from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType
 from collector_db.DTOs.URLInfo import URLInfo
-from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo
 from collector_db.DatabaseClient import
DatabaseClient -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType +from collector_db.enums import TaskType from collector_manager.enums import CollectorType, URLStatus from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO @@ -190,10 +190,10 @@ def urls( ) ) - return self.db_client.insert_urls( + return asyncio.run(self.adb_client.insert_urls( url_infos=url_infos, batch_id=batch_id, - ) + )) async def url_miscellaneous_metadata( self, @@ -282,17 +282,24 @@ async def agency_auto_suggestions( if suggestion_type == SuggestionType.UNKNOWN: count = 1 # Can only be one auto suggestion if unknown - await self.adb_client.add_agency_auto_suggestions( - suggestions=[ - URLAgencySuggestionInfo( + suggestions = [] + for _ in range(count): + if suggestion_type == SuggestionType.UNKNOWN: + pdap_agency_id = None + else: + pdap_agency_id = await self.agency() + suggestion = URLAgencySuggestionInfo( url_id=url_id, suggestion_type=suggestion_type, - pdap_agency_id=None if suggestion_type == SuggestionType.UNKNOWN else await self.agency(), + pdap_agency_id=pdap_agency_id, state="Test State", county="Test County", locality="Test Locality" - ) for _ in range(count) - ] + ) + suggestions.append(suggestion) + + await self.adb_client.add_agency_auto_suggestions( + suggestions=suggestions ) async def agency_confirmed_suggestion( diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 7018d5aa..8f1fc630 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -56,15 +56,15 @@ async def test_url_html_cycle( db_data_creator: DBDataCreator ): batch_id = db_data_creator.batch() - db_client = db_data_creator.db_client + adb_client: AsyncDatabaseClient = db_data_creator.adb_client url_infos = [] for url in URLS: url_infos.append(URLInfo(url=url)) - db_client.insert_urls(url_infos=url_infos, batch_id=batch_id) + await adb_client.insert_urls(url_infos=url_infos, batch_id=batch_id) operator = URLHTMLTaskOperator( - adb_client=AsyncDatabaseClient(), + adb_client=adb_client, url_request_interface=URLRequestInterface(), html_parser=HTMLResponseParser( root_url_cache=RootURLCache() diff --git a/tests/manual/source_collectors/test_autogoogler_collector.py b/tests/manual/source_collectors/test_autogoogler_collector.py index e2c2b8e1..78fc46d7 100644 --- a/tests/manual/source_collectors/test_autogoogler_collector.py +++ b/tests/manual/source_collectors/test_autogoogler_collector.py @@ -1,12 +1,15 @@ -from unittest.mock import MagicMock +from unittest.mock import MagicMock, AsyncMock +import pytest + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient from core.CoreLogger import CoreLogger from source_collectors.auto_googler.AutoGooglerCollector import AutoGooglerCollector from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO - -def test_autogoogler_collector(): +@pytest.mark.asyncio +async def test_autogoogler_collector(): collector = AutoGooglerCollector( batch_id=1, dto=AutoGooglerInputDTO( @@ -14,8 +17,8 @@ def test_autogoogler_collector(): queries=["police"], ), logger = MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=AsyncMock(spec=AsyncDatabaseClient), 
        raise_error=True
     )
-    collector.run()
+    await collector.run()
     print(collector.data)
\ No newline at end of file
diff --git a/tests/manual/source_collectors/test_ckan_collector.py b/tests/manual/source_collectors/test_ckan_collector.py
index 53fb711d..3bae5d88 100644
--- a/tests/manual/source_collectors/test_ckan_collector.py
+++ b/tests/manual/source_collectors/test_ckan_collector.py
@@ -34,14 +34,16 @@ async def test_ckan_collector_default():
         logger=MagicMock(spec=CoreLogger),
         adb_client=AsyncMock(spec=AsyncDatabaseClient),
         raise_error=True
     )
     await collector.run()
     schema = CKANSchema(many=True)
     schema.load(collector.data["results"])

-    print(collector.data)
+    print("Printing results")
+    print(collector.data["results"])

-def test_ckan_collector_custom():
+@pytest.mark.asyncio
+async def test_ckan_collector_custom():
     """
     Use this to test how CKAN behaves when using
     something other than the default options provided
@@ -80,9 +82,9 @@
         }
     ),
     logger=MagicMock(spec=CoreLogger),
-    db_client=MagicMock(spec=DatabaseClient),
+    adb_client=AsyncMock(spec=AsyncDatabaseClient),
     raise_error=True
 )
-    collector.run()
+    await collector.run()
     schema = CKANSchema(many=True)
     schema.load(collector.data["results"])
\ No newline at end of file
diff --git a/tests/manual/source_collectors/test_common_crawler_collector.py b/tests/manual/source_collectors/test_common_crawler_collector.py
index 65ec778d..6c9771f3 100644
--- a/tests/manual/source_collectors/test_common_crawler_collector.py
+++ b/tests/manual/source_collectors/test_common_crawler_collector.py
@@ -1,7 +1,9 @@
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, AsyncMock
+import pytest

 from marshmallow import Schema, fields

+from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.DatabaseClient import DatabaseClient
 from core.CoreLogger import CoreLogger
 from source_collectors.common_crawler.CommonCrawlerCollector import CommonCrawlerCollector
@@ -11,13 +13,15 @@

 class CommonCrawlerSchema(Schema):
     urls = fields.List(fields.String())

-def test_common_crawler_collector():
+@pytest.mark.asyncio
+async def test_common_crawler_collector():
     collector = CommonCrawlerCollector(
         batch_id=1,
         dto=CommonCrawlerInputDTO(),
         logger=MagicMock(spec=CoreLogger),
-        db_client=MagicMock(spec=DatabaseClient)
+        adb_client=AsyncMock(spec=AsyncDatabaseClient),
+        raise_error=True
     )
-    collector.run()
+    await collector.run()
     print(collector.data)
     CommonCrawlerSchema().load(collector.data)
diff --git a/tests/manual/source_collectors/test_muckrock_collectors.py b/tests/manual/source_collectors/test_muckrock_collectors.py
index 4689dbab..8fb80bc4 100644
--- a/tests/manual/source_collectors/test_muckrock_collectors.py
+++ b/tests/manual/source_collectors/test_muckrock_collectors.py
@@ -1,16 +1,20 @@
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, AsyncMock

-from collector_db.DatabaseClient import DatabaseClient
+import pytest
+
+from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from core.CoreLogger import CoreLogger
 from source_collectors.muckrock.DTOs import MuckrockSimpleSearchCollectorInputDTO, \
     MuckrockCountySearchCollectorInputDTO, MuckrockAllFOIARequestsCollectorInputDTO
 from source_collectors.muckrock.classes.MuckrockCollector import MuckrockSimpleSearchCollector, \
     MuckrockCountyLevelSearchCollector, MuckrockAllFOIARequestsCollector
 from source_collectors.muckrock.schemas import MuckrockURLInfoSchema
-from test_automated.integration.core.helpers.constants
import ALLEGHENY_COUNTY_MUCKROCK_ID, ALLEGHENY_COUNTY_TOWN_NAMES +from tests.test_automated.integration.core.helpers.constants import ALLEGHENY_COUNTY_MUCKROCK_ID, \ + ALLEGHENY_COUNTY_TOWN_NAMES -def test_muckrock_simple_search_collector(): +@pytest.mark.asyncio +async def test_muckrock_simple_search_collector(): collector = MuckrockSimpleSearchCollector( batch_id=1, @@ -19,16 +23,18 @@ def test_muckrock_simple_search_collector(): max_results=10 ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() schema = MuckrockURLInfoSchema(many=True) schema.load(collector.data["urls"]) assert len(collector.data["urls"]) >= 10 + print(collector.data) -def test_muckrock_county_level_search_collector(): +@pytest.mark.asyncio +async def test_muckrock_county_level_search_collector(): collector = MuckrockCountyLevelSearchCollector( batch_id=1, dto=MuckrockCountySearchCollectorInputDTO( @@ -36,16 +42,19 @@ def test_muckrock_county_level_search_collector(): town_names=ALLEGHENY_COUNTY_TOWN_NAMES ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient) + adb_client=AsyncMock(spec=AsyncDatabaseClient), + raise_error=True ) - collector.run() + await collector.run() schema = MuckrockURLInfoSchema(many=True) schema.load(collector.data["urls"]) assert len(collector.data["urls"]) >= 10 + print(collector.data) -def test_muckrock_full_search_collector(): +@pytest.mark.asyncio +async def test_muckrock_full_search_collector(): collector = MuckrockAllFOIARequestsCollector( batch_id=1, dto=MuckrockAllFOIARequestsCollectorInputDTO( @@ -53,9 +62,11 @@ def test_muckrock_full_search_collector(): total_pages=2 ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient) + adb_client=AsyncMock(spec=AsyncDatabaseClient), + raise_error=True ) - collector.run() + await collector.run() assert len(collector.data["urls"]) >= 1 schema = MuckrockURLInfoSchema(many=True) - schema.load(collector.data["urls"]) \ No newline at end of file + schema.load(collector.data["urls"]) + print(collector.data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index c31676b6..81207a28 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -58,7 +58,7 @@ def test_example_collector(api_test_helper): assert bi.user_id is not None # Flush early to ensure logs are written - ath.core.collector_manager.logger.flush_all() + ath.core.core_logger.flush_all() lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 6090aaf1..c78bf57e 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -18,8 +18,11 @@ from tests.helpers.DBDataCreator import DBDataCreator from tests.helpers.complex_test_data_functions import setup_for_get_next_url_for_final_review - -def test_insert_urls(db_client_test): +@pytest.mark.asyncio +async def test_insert_urls( + db_client_test, + adb_client_test +): # Insert batch batch_info = BatchInfo( strategy="ckan", @@ -43,7 +46,7 @@ def test_insert_urls(db_client_test): 
collector_metadata={"name": "example_duplicate"}, ) ] - insert_urls_info = db_client_test.insert_urls( + insert_urls_info = await adb_client_test.insert_urls( url_infos=urls, batch_id=batch_id ) diff --git a/tests/test_automated/integration/conftest.py b/tests/test_automated/integration/conftest.py index 4377fd76..8ffdc266 100644 --- a/tests/test_automated/integration/conftest.py +++ b/tests/test_automated/integration/conftest.py @@ -4,7 +4,6 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_manager.AsyncCollectorManager import AsyncCollectorManager -from collector_manager.CollectorManager import CollectorManager from core.AsyncCore import AsyncCore from core.CoreLogger import CoreLogger from core.SourceCollectorCore import SourceCollectorCore @@ -17,10 +16,6 @@ def test_core(db_client_test): ) as logger: core = SourceCollectorCore( db_client=db_client_test, - collector_manager=CollectorManager( - db_client=db_client_test, - logger=logger - ), core_logger=logger, dev_mode=True ) diff --git a/tests/test_automated/integration/core/helpers/common_test_procedures.py b/tests/test_automated/integration/core/helpers/common_test_procedures.py deleted file mode 100644 index d60c59d2..00000000 --- a/tests/test_automated/integration/core/helpers/common_test_procedures.py +++ /dev/null @@ -1,27 +0,0 @@ -import time - -from pydantic import BaseModel - -from collector_manager.enums import CollectorType -from core.SourceCollectorCore import SourceCollectorCore - - -def run_collector_and_wait_for_completion( - collector_type: CollectorType, - core: SourceCollectorCore, - dto: BaseModel -): - collector_name = collector_type.value - response = core.initiate_collector( - collector_type=collector_type, - dto=dto - ) - assert response == f"Started {collector_name} collector with CID: 1" - response = core.get_status(1) - while response == f"1 ({collector_name}) - RUNNING": - time.sleep(1) - response = core.get_status(1) - assert response == f"1 ({collector_name}) - COMPLETED", response - # TODO: Change this logic, since collectors close automatically - response = core.close_collector(1) - assert response.message == "Collector closed and data harvested successfully." 
diff --git a/tests/test_automated/unit/source_collectors/test_ckan_collector.py b/tests/test_automated/unit/source_collectors/test_ckan_collector.py index 21f469dc..b00ed434 100644 --- a/tests/test_automated/unit/source_collectors/test_ckan_collector.py +++ b/tests/test_automated/unit/source_collectors/test_ckan_collector.py @@ -1,9 +1,10 @@ import json import pickle -from unittest.mock import MagicMock +from unittest.mock import MagicMock, AsyncMock import pytest +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient from core.CoreLogger import CoreLogger from source_collectors.ckan.CKANCollector import CKANCollector @@ -12,13 +13,13 @@ @pytest.fixture def mock_ckan_collector_methods(monkeypatch): - mock = MagicMock() + mock = AsyncMock() mock_path = "source_collectors.ckan.CKANCollector.CKANCollector.get_results" with open("tests/test_data/ckan_get_result_test_data.json", "r", encoding="utf-8") as f: data = json.load(f) - mock.get_results = MagicMock() + mock.get_results = AsyncMock() mock.get_results.return_value = data monkeypatch.setattr(mock_path, mock.get_results) @@ -26,7 +27,7 @@ def mock_ckan_collector_methods(monkeypatch): with open("tests/test_data/ckan_add_collection_child_packages.pkl", "rb") as f: data = pickle.load(f) - mock.add_collection_child_packages = MagicMock() + mock.add_collection_child_packages = AsyncMock() mock.add_collection_child_packages.return_value = data monkeypatch.setattr(mock_path, mock.add_collection_child_packages) @@ -34,23 +35,24 @@ def mock_ckan_collector_methods(monkeypatch): yield mock -def test_ckan_collector(mock_ckan_collector_methods): +@pytest.mark.asyncio +async def test_ckan_collector(mock_ckan_collector_methods): mock = mock_ckan_collector_methods collector = CKANCollector( batch_id=1, dto=CKANInputDTO(), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() mock.get_results.assert_called_once() mock.add_collection_child_packages.assert_called_once() - collector.db_client.insert_urls.assert_called_once() - url_infos = collector.db_client.insert_urls.call_args[1]['url_infos'] + collector.adb_client.insert_urls.assert_called_once() + url_infos = collector.adb_client.insert_urls.call_args[1]['url_infos'] assert len(url_infos) == 2560 first_url_info = url_infos[0] assert first_url_info.url == 'https://catalog.data.gov/dataset/crash-reporting-drivers-data' diff --git a/tests/test_automated/unit/source_collectors/test_collector_closes_properly.py b/tests/test_automated/unit/source_collectors/test_collector_closes_properly.py deleted file mode 100644 index 386120a8..00000000 --- a/tests/test_automated/unit/source_collectors/test_collector_closes_properly.py +++ /dev/null @@ -1,71 +0,0 @@ -import threading -import time -from unittest.mock import Mock, MagicMock - -from collector_db.DTOs.LogInfo import LogInfo -from collector_db.DatabaseClient import DatabaseClient -from collector_manager.CollectorBase import CollectorBase -from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger -from core.enums import BatchStatus - - -# Mock a subclass to implement the abstract method -class MockCollector(CollectorBase): - collector_type = CollectorType.EXAMPLE - preprocessor = MagicMock() - - def __init__(self, dto, **kwargs): - super().__init__( - batch_id=1, - dto=dto, - logger=Mock(spec=CoreLogger), - 
db_client=Mock(spec=DatabaseClient), - raise_error=True - ) - - def run_implementation(self): - while True: - time.sleep(0.1) # Simulate work - self.log("Working...") - -def test_collector_closes_properly(): - # Mock dependencies - mock_dto = Mock() - - # Initialize the collector - collector = MockCollector( - dto=mock_dto, - ) - - # Run the collector in a separate thread - thread = threading.Thread(target=collector.run) - thread.start() - - # Run the collector for a time - time.sleep(1) - # Signal the collector to stop - collector.abort() - - thread.join() - - - - # Assertions - # Check that multiple log calls have been made - assert collector.logger.log.call_count > 1 - # Check that last call to collector.logger.log was with the correct message - assert collector.logger.log.call_args[0][0] == LogInfo( - id=None, - log='Collector was aborted.', - batch_id=1, - created_at=None - ) - - assert not thread.is_alive(), "Thread is still alive after aborting." - assert collector._stop_event.is_set(), "Stop event was not set." - assert collector.status == BatchStatus.ABORTED, "Collector status is not ABORTED." - - print("Test passed: Collector closes properly.") - - diff --git a/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py b/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py index e0dbd144..74fe1052 100644 --- a/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py @@ -2,6 +2,7 @@ import pytest +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo from collector_db.DatabaseClient import DatabaseClient from core.CoreLogger import CoreLogger @@ -23,20 +24,21 @@ def mock_get_common_crawl_search_results(): mock_get_common_crawl_search_results.return_value = mock_results yield mock_get_common_crawl_search_results - -def test_common_crawl_collector(mock_get_common_crawl_search_results): +@pytest.mark.asyncio +async def test_common_crawl_collector(mock_get_common_crawl_search_results): collector = CommonCrawlerCollector( batch_id=1, dto=CommonCrawlerInputDTO( search_term="keyword", ), logger=mock.MagicMock(spec=CoreLogger), - db_client=mock.MagicMock(spec=DatabaseClient) + adb_client=mock.AsyncMock(spec=AsyncDatabaseClient), + raise_error=True ) - collector.run() + await collector.run() mock_get_common_crawl_search_results.assert_called_once() - collector.db_client.insert_urls.assert_called_once_with( + collector.adb_client.insert_urls.assert_called_once_with( url_infos=[ URLInfo(url="http://keyword.com"), URLInfo(url="http://keyword.com/page3") diff --git a/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py b/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py index 7dbb92c5..f74c651e 100644 --- a/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py @@ -1,8 +1,9 @@ from unittest import mock -from unittest.mock import MagicMock, call +from unittest.mock import MagicMock, call, AsyncMock import pytest +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo from collector_db.DatabaseClient import DatabaseClient from core.CoreLogger import CoreLogger @@ -24,15 +25,15 @@ def patch_muckrock_fetcher(monkeypatch): test_data = { "results": inner_test_data } - mock = MagicMock() + mock = AsyncMock() 
mock.return_value = test_data monkeypatch.setattr(patch_path, mock) return mock - -def test_muckrock_simple_collector(patch_muckrock_fetcher): +@pytest.mark.asyncio +async def test_muckrock_simple_collector(patch_muckrock_fetcher): collector = MuckrockSimpleSearchCollector( batch_id=1, dto=MuckrockSimpleSearchCollectorInputDTO( @@ -40,16 +41,16 @@ def test_muckrock_simple_collector(patch_muckrock_fetcher): max_results=2 ), logger=mock.MagicMock(spec=CoreLogger), - db_client=mock.MagicMock(spec=DatabaseClient), + adb_client=mock.AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() patch_muckrock_fetcher.assert_has_calls( [ call(FOIAFetchRequest(page=1, page_size=100)), ] ) - collector.db_client.insert_urls.assert_called_once_with( + collector.adb_client.insert_urls.assert_called_once_with( url_infos=[ URLInfo( url='https://include.com/1', @@ -80,13 +81,14 @@ def patch_muckrock_county_level_search_collector_methods(monkeypatch): {"absolute_url": "https://include.com/3", "title": "lemon"}, ] mock = MagicMock() - mock.get_jurisdiction_ids = MagicMock(return_value=get_jurisdiction_ids_data) - mock.get_foia_records = MagicMock(return_value=get_foia_records_data) + mock.get_jurisdiction_ids = AsyncMock(return_value=get_jurisdiction_ids_data) + mock.get_foia_records = AsyncMock(return_value=get_foia_records_data) monkeypatch.setattr(patch_path_get_jurisdiction_ids, mock.get_jurisdiction_ids) monkeypatch.setattr(patch_path_get_foia_records, mock.get_foia_records) return mock -def test_muckrock_county_search_collector(patch_muckrock_county_level_search_collector_methods): +@pytest.mark.asyncio +async def test_muckrock_county_search_collector(patch_muckrock_county_level_search_collector_methods): mock_methods = patch_muckrock_county_level_search_collector_methods collector = MuckrockCountyLevelSearchCollector( @@ -96,15 +98,15 @@ def test_muckrock_county_search_collector(patch_muckrock_county_level_search_col town_names=["test"] ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() mock_methods.get_jurisdiction_ids.assert_called_once() mock_methods.get_foia_records.assert_called_once_with({"Alpha": 1, "Beta": 2}) - collector.db_client.insert_urls.assert_called_once_with( + collector.adb_client.insert_urls.assert_called_once_with( url_infos=[ URLInfo( url='https://include.com/1', @@ -142,9 +144,9 @@ def patch_muckrock_full_search_collector(monkeypatch): } ] }] - mock = MagicMock() + mock = AsyncMock() mock.return_value = test_data - mock.get_page_data = MagicMock(return_value=test_data) + mock.get_page_data = AsyncMock(return_value=test_data) monkeypatch.setattr(patch_path, mock.get_page_data) patch_path = ("source_collectors.muckrock.classes.MuckrockCollector." 
@@ -155,7 +157,8 @@ def patch_muckrock_full_search_collector(monkeypatch): return mock -def test_muckrock_all_foia_requests_collector(patch_muckrock_full_search_collector): +@pytest.mark.asyncio +async def test_muckrock_all_foia_requests_collector(patch_muckrock_full_search_collector): mock = patch_muckrock_full_search_collector collector = MuckrockAllFOIARequestsCollector( batch_id=1, @@ -164,14 +167,14 @@ def test_muckrock_all_foia_requests_collector(patch_muckrock_full_search_collect total_pages=2 ), logger=MagicMock(spec=CoreLogger), - db_client=MagicMock(spec=DatabaseClient), + adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) - collector.run() + await collector.run() mock.get_page_data.assert_called_once_with(mock.foia_fetcher.return_value, 1, 2) - collector.db_client.insert_urls.assert_called_once_with( + collector.adb_client.insert_urls.assert_called_once_with( url_infos=[ URLInfo( url='https://include.com/1', From 7bfd1e4f76182489c5d0199a5b527cfe616e3597 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 12 Apr 2025 17:41:25 -0400 Subject: [PATCH 096/182] DRAFT --- api/main.py | 18 +- collector_db/DatabaseClient.py | 58 ++++++ collector_manager/AsyncCollectorBase.py | 7 +- collector_manager/ExampleCollector.py | 1 - collector_manager/constants.py | 11 -- core/AsyncCore.py | 168 +++-------------- core/TaskManager.py | 177 ++++++++++++++++++ tests/conftest.py | 2 +- tests/helpers/DBDataCreator.py | 4 +- .../integration/api/test_example_collector.py | 2 + tests/test_automated/integration/conftest.py | 5 +- .../integration/core/test_async_core.py | 35 +--- 12 files changed, 288 insertions(+), 200 deletions(-) delete mode 100644 collector_manager/constants.py create mode 100644 core/TaskManager.py diff --git a/api/main.py b/api/main.py index 37521822..cc7e3fa2 100644 --- a/api/main.py +++ b/api/main.py @@ -18,6 +18,7 @@ from core.CoreLogger import CoreLogger from core.ScheduledTaskManager import AsyncScheduledTaskManager from core.SourceCollectorCore import SourceCollectorCore +from core.TaskManager import TaskManager from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.RootURLCache import RootURLCache from html_tag_collector.URLRequestInterface import URLRequestInterface @@ -45,13 +46,16 @@ async def lifespan(app: FastAPI): ) async_core = AsyncCore( adb_client=adb_client, - huggingface_interface=HuggingFaceInterface(), - url_request_interface=URLRequestInterface(), - html_parser=HTMLResponseParser( - root_url_cache=RootURLCache() - ), - discord_poster=DiscordPoster( - webhook_url=get_from_env("DISCORD_WEBHOOK_URL") + task_manager=TaskManager( + adb_client=adb_client, + huggingface_interface=HuggingFaceInterface(), + url_request_interface=URLRequestInterface(), + html_parser=HTMLResponseParser( + root_url_cache=RootURLCache() + ), + discord_poster=DiscordPoster( + webhook_url=get_from_env("DISCORD_WEBHOOK_URL") + ), ), collector_manager=async_collector_manager ) diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index 06107651..8d72ef0d 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -3,13 +3,16 @@ from typing import Optional, List from sqlalchemy import create_engine, Row +from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, aliased from collector_db.ConfigManager import ConfigManager from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DTOs.DuplicateInfo import DuplicateInfo, DuplicateInsertInfo +from 
collector_db.DTOs.InsertURLsInfo import InsertURLsInfo
 from collector_db.DTOs.LogInfo import LogInfo, LogOutputInfo
 from collector_db.DTOs.URLInfo import URLInfo
+from collector_db.DTOs.URLMapping import URLMapping
 from collector_db.helper_functions import get_postgres_connection_string
 from collector_db.models import Base, Batch, URL, Log, Duplicate
 from collector_manager.enums import CollectorType
@@ -90,6 +93,61 @@ def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo])
             session.add(duplicate)


+    @session_manager
+    def get_url_info_by_url(self, session, url: str) -> Optional[URLInfo]:
+        url_entry = session.query(URL).filter_by(url=url).first()
+        return URLInfo(**url_entry.__dict__) if url_entry is not None else None
+
+    @session_manager
+    def insert_url(self, session, url_info: URLInfo) -> int:
+        """Insert a new URL into the database."""
+        url_entry = URL(
+            batch_id=url_info.batch_id,
+            url=url_info.url,
+            collector_metadata=url_info.collector_metadata,
+            outcome=url_info.outcome.value
+        )
+        session.add(url_entry)
+        session.commit()
+        session.refresh(url_entry)
+        return url_entry.id
+
+    @session_manager
+    def add_duplicate_info(self, session, duplicate_infos: list[DuplicateInfo]):
+        # TODO: Add test for this method when testing CollectorDatabaseProcessor
+        for duplicate_info in duplicate_infos:
+            duplicate = Duplicate(
+                batch_id=duplicate_info.original_batch_id,
+                original_url_id=duplicate_info.original_url_id,
+            )
+            session.add(duplicate)
+
+
+    def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo:
+        url_mappings = []
+        duplicates = []
+        for url_info in url_infos:
+            url_info.batch_id = batch_id
+            try:
+                url_id = self.insert_url(url_info)
+                url_mappings.append(URLMapping(url_id=url_id, url=url_info.url))
+            except IntegrityError:
+                orig_url_info = self.get_url_info_by_url(url_info.url)
+                duplicate_info = DuplicateInsertInfo(
+                    duplicate_batch_id=batch_id,
+                    original_url_id=orig_url_info.id
+                )
+                duplicates.append(duplicate_info)
+        self.insert_duplicates(duplicates)
+
+        return InsertURLsInfo(
+            url_mappings=url_mappings,
+            total_count=len(url_infos),
+            original_count=len(url_mappings),
+            duplicate_count=len(duplicates),
+            url_ids=[url_mapping.url_id for url_mapping in url_mappings]
+        )
     @session_manager
     def get_urls_by_batch(self, session, batch_id: int, page: int = 1) -> List[URLInfo]:
         """Retrieve all URLs associated with a batch."""
diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py
index 672d9d9c..e93f97fc 100644
--- a/collector_manager/AsyncCollectorBase.py
+++ b/collector_manager/AsyncCollectorBase.py
@@ -11,6 +11,7 @@
 from collector_db.DTOs.LogInfo import LogInfo
 from collector_manager.enums import CollectorType
 from core.CoreLogger import CoreLogger
+from core.TaskManager import TaskManager
 from core.enums import BatchStatus
 from core.preprocessors.PreprocessorBase import PreprocessorBase
@@ -26,8 +27,12 @@ def __init__(
         dto: BaseModel,
         logger: CoreLogger,
         adb_client: AsyncDatabaseClient,
-        raise_error: bool = False
+        raise_error: bool = False,
+        trigger_followup_tasks: bool = False,
+        task_manager: TaskManager = None
     ) -> None:
+        self.trigger_followup_tasks = trigger_followup_tasks
+        self.task_manager = task_manager
         self.batch_id = batch_id
         self.adb_client = adb_client
         self.dto = dto
diff --git a/collector_manager/ExampleCollector.py b/collector_manager/ExampleCollector.py
index 2d54eced..9f451732 100644
--- a/collector_manager/ExampleCollector.py
+++ b/collector_manager/ExampleCollector.py
@@ -4,7 +4,6 @@
 """
 import asyncio
-import time from collector_manager.AsyncCollectorBase import AsyncCollectorBase from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO diff --git a/collector_manager/constants.py b/collector_manager/constants.py deleted file mode 100644 index fde231d9..00000000 --- a/collector_manager/constants.py +++ /dev/null @@ -1,11 +0,0 @@ -from collector_manager.enums import CollectorType - -ASYNC_COLLECTORS = [ - CollectorType.AUTO_GOOGLER, - CollectorType.EXAMPLE, - CollectorType.CKAN, - CollectorType.COMMON_CRAWLER, - CollectorType.MUCKROCK_SIMPLE_SEARCH, - CollectorType.MUCKROCK_COUNTY_SEARCH, - CollectorType.MUCKROCK_ALL_SEARCH, -] diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 0b24e061..971cd03d 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,17 +1,11 @@ -import logging -from http import HTTPStatus -from http.client import HTTPException from typing import Optional from pydantic import BaseModel -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo -from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType from collector_manager.AsyncCollectorManager import AsyncCollectorManager -from collector_manager.constants import ASYNC_COLLECTORS from collector_manager.enums import CollectorType from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo @@ -23,45 +17,22 @@ from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.MessageResponse import MessageResponse -from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome -from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator -from core.classes.TaskOperatorBase import TaskOperatorBase -from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator -from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator -from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator -from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator +from core.TaskManager import TaskManager from core.enums import BatchStatus, RecordType -from html_tag_collector.ResponseParser import HTMLResponseParser -from html_tag_collector.URLRequestInterface import URLRequestInterface -from hugging_face.HuggingFaceInterface import HuggingFaceInterface -from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier -from pdap_api_client.AccessManager import AccessManager -from pdap_api_client.PDAPClient import PDAPClient from security_manager.SecurityManager import AccessInfo -from util.DiscordNotifier import DiscordPoster -from util.helper_functions import get_from_env -TASK_REPEAT_THRESHOLD = 20 class AsyncCore: def __init__( self, adb_client: AsyncDatabaseClient, - huggingface_interface: HuggingFaceInterface, - url_request_interface: URLRequestInterface, - html_parser: HTMLResponseParser, - discord_poster: DiscordPoster, - collector_manager: AsyncCollectorManager + collector_manager: AsyncCollectorManager, + task_manager: TaskManager ): + self.task_manager = task_manager self.adb_client = adb_client - self.huggingface_interface = huggingface_interface - self.url_request_interface = url_request_interface - self.html_parser = html_parser - self.logger = 
logging.getLogger(__name__) - self.logger.addHandler(logging.StreamHandler()) - self.logger.setLevel(logging.INFO) - self.discord_poster = discord_poster + self.collector_manager = collector_manager @@ -96,11 +67,6 @@ async def initiate_collector( Reserves a batch ID from the database and starts the requisite collector """ - if collector_type not in ASYNC_COLLECTORS: - raise HTTPException( - f"Collector type {collector_type} is not supported", - HTTPStatus.BAD_REQUEST - ) batch_info = BatchInfo( strategy=collector_type.value, @@ -122,117 +88,23 @@ async def initiate_collector( # endregion - - #region Task Operators - async def get_url_html_task_operator(self): - self.logger.info("Running URL HTML Task") - operator = URLHTMLTaskOperator( - adb_client=self.adb_client, - url_request_interface=self.url_request_interface, - html_parser=self.html_parser - ) - return operator - - async def get_url_relevance_huggingface_task_operator(self): - self.logger.info("Running URL Relevance Huggingface Task") - operator = URLRelevanceHuggingfaceTaskOperator( - adb_client=self.adb_client, - huggingface_interface=self.huggingface_interface - ) - return operator - - async def get_url_record_type_task_operator(self): - operator = URLRecordTypeTaskOperator( - adb_client=self.adb_client, - classifier=OpenAIRecordClassifier() - ) - return operator - - async def get_agency_identification_task_operator(self): - pdap_client = PDAPClient( - access_manager=AccessManager( - email=get_from_env("PDAP_EMAIL"), - password=get_from_env("PDAP_PASSWORD"), - api_key=get_from_env("PDAP_API_KEY"), - ), - ) - muckrock_api_interface = MuckrockAPIInterface() - operator = AgencyIdentificationTaskOperator( - adb_client=self.adb_client, - pdap_client=pdap_client, - muckrock_api_interface=muckrock_api_interface - ) - return operator - - async def get_url_miscellaneous_metadata_task_operator(self): - operator = URLMiscellaneousMetadataTaskOperator( - adb_client=self.adb_client - ) - return operator - - async def get_task_operators(self) -> list[TaskOperatorBase]: - return [ - await self.get_url_html_task_operator(), - await self.get_url_relevance_huggingface_task_operator(), - await self.get_url_record_type_task_operator(), - await self.get_agency_identification_task_operator(), - await self.get_url_miscellaneous_metadata_task_operator() - ] - - #endregion - - #region Tasks async def run_tasks(self): - operators = await self.get_task_operators() - count = 0 - for operator in operators: - - meets_prereq = await operator.meets_task_prerequisites() - while meets_prereq: - if count > TASK_REPEAT_THRESHOLD: - self.discord_poster.post_to_discord( - message=f"Task {operator.task_type.value} has been run" - f" more than {TASK_REPEAT_THRESHOLD} times in a row. 
" - f"Task loop terminated.") - break - task_id = await self.initiate_task_in_db(task_type=operator.task_type) - run_info: TaskOperatorRunInfo = await operator.run_task(task_id) - await self.conclude_task(run_info) - count += 1 - meets_prereq = await operator.meets_task_prerequisites() - - - async def conclude_task(self, run_info): - await self.adb_client.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids) - await self.handle_outcome(run_info) - - async def initiate_task_in_db(self, task_type: TaskType) -> int: - self.logger.info(f"Initiating {task_type.value} Task") - task_id = await self.adb_client.initiate_task(task_type=task_type) - return task_id - - async def handle_outcome(self, run_info: TaskOperatorRunInfo): - match run_info.outcome: - case TaskOperatorOutcome.ERROR: - await self.handle_task_error(run_info) - case TaskOperatorOutcome.SUCCESS: - await self.adb_client.update_task_status( - task_id=run_info.task_id, - status=BatchStatus.COMPLETE - ) - - async def handle_task_error(self, run_info: TaskOperatorRunInfo): - await self.adb_client.update_task_status(task_id=run_info.task_id, status=BatchStatus.ERROR) - await self.adb_client.add_task_error(task_id=run_info.task_id, error=run_info.message) - - async def get_task_info(self, task_id: int) -> TaskInfo: - return await self.adb_client.get_task_info(task_id=task_id) - - async def get_tasks(self, page: int, task_type: TaskType, task_status: BatchStatus) -> GetTasksResponse: - return await self.adb_client.get_tasks(page=page, task_type=task_type, task_status=task_status) + await self.task_manager.run_tasks() + async def get_tasks( + self, + page: int, + task_type: TaskType, + task_status: BatchStatus + ) -> GetTasksResponse: + return await self.task_manager.get_tasks( + page=page, + task_type=task_type, + task_status=task_status + ) - #endregion + async def get_task_info(self, task_id): + return await self.task_manager.get_task_info(task_id) #region Annotations and Review @@ -346,3 +218,5 @@ async def reject_url( user_id=access_info.user_id ) + + diff --git a/core/TaskManager.py b/core/TaskManager.py new file mode 100644 index 00000000..003fda0f --- /dev/null +++ b/core/TaskManager.py @@ -0,0 +1,177 @@ +import logging + +from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.TaskInfo import TaskInfo +from collector_db.enums import TaskType +from core.DTOs.GetTasksResponse import GetTasksResponse +from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome +from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator +from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator +from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator +from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator +from core.enums import BatchStatus +from html_tag_collector.ResponseParser import HTMLResponseParser +from html_tag_collector.URLRequestInterface import URLRequestInterface +from hugging_face.HuggingFaceInterface import HuggingFaceInterface +from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.PDAPClient import PDAPClient +from util.DiscordNotifier 
import DiscordPoster +from util.helper_functions import get_from_env + +TASK_REPEAT_THRESHOLD = 20 + +class TaskManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient, + huggingface_interface: HuggingFaceInterface, + url_request_interface: URLRequestInterface, + html_parser: HTMLResponseParser, + discord_poster: DiscordPoster, + ): + self.adb_client = adb_client + self.huggingface_interface = huggingface_interface + self.url_request_interface = url_request_interface + self.html_parser = html_parser + self.discord_poster = discord_poster + self.logger = logging.getLogger(__name__) + self.logger.addHandler(logging.StreamHandler()) + self.logger.setLevel(logging.INFO) + + + + #region Task Operators + async def get_url_html_task_operator(self): + self.logger.info("Running URL HTML Task") + operator = URLHTMLTaskOperator( + adb_client=self.adb_client, + url_request_interface=self.url_request_interface, + html_parser=self.html_parser + ) + return operator + + async def get_url_relevance_huggingface_task_operator(self): + self.logger.info("Running URL Relevance Huggingface Task") + operator = URLRelevanceHuggingfaceTaskOperator( + adb_client=self.adb_client, + huggingface_interface=self.huggingface_interface + ) + return operator + + async def get_url_record_type_task_operator(self): + operator = URLRecordTypeTaskOperator( + adb_client=self.adb_client, + classifier=OpenAIRecordClassifier() + ) + return operator + + async def get_agency_identification_task_operator(self): + pdap_client = PDAPClient( + access_manager=AccessManager( + email=get_from_env("PDAP_EMAIL"), + password=get_from_env("PDAP_PASSWORD"), + api_key=get_from_env("PDAP_API_KEY"), + ), + ) + muckrock_api_interface = MuckrockAPIInterface() + operator = AgencyIdentificationTaskOperator( + adb_client=self.adb_client, + pdap_client=pdap_client, + muckrock_api_interface=muckrock_api_interface + ) + return operator + + async def get_url_miscellaneous_metadata_task_operator(self): + operator = URLMiscellaneousMetadataTaskOperator( + adb_client=self.adb_client + ) + return operator + + async def get_task_operators(self) -> list[TaskOperatorBase]: + return [ + await self.get_url_html_task_operator(), + await self.get_url_relevance_huggingface_task_operator(), + await self.get_url_record_type_task_operator(), + await self.get_agency_identification_task_operator(), + await self.get_url_miscellaneous_metadata_task_operator() + ] + + #endregion + + #region Tasks + async def run_tasks(self): + operators = await self.get_task_operators() + count = 0 + for operator in operators: + + meets_prereq = await operator.meets_task_prerequisites() + while meets_prereq: + if count > TASK_REPEAT_THRESHOLD: + self.discord_poster.post_to_discord( + message=f"Task {operator.task_type.value} has been run" + f" more than {TASK_REPEAT_THRESHOLD} times in a row. 
" + f"Task loop terminated.") + break + task_id = await self.initiate_task_in_db(task_type=operator.task_type) + run_info: TaskOperatorRunInfo = await operator.run_task(task_id) + await self.conclude_task(run_info) + count += 1 + meets_prereq = await operator.meets_task_prerequisites() + + + async def conclude_task(self, run_info): + await self.adb_client.link_urls_to_task( + task_id=run_info.task_id, + url_ids=run_info.linked_url_ids + ) + await self.handle_outcome(run_info) + + async def initiate_task_in_db(self, task_type: TaskType) -> int: + self.logger.info(f"Initiating {task_type.value} Task") + task_id = await self.adb_client.initiate_task(task_type=task_type) + return task_id + + async def handle_outcome(self, run_info: TaskOperatorRunInfo): + match run_info.outcome: + case TaskOperatorOutcome.ERROR: + await self.handle_task_error(run_info) + case TaskOperatorOutcome.SUCCESS: + await self.adb_client.update_task_status( + task_id=run_info.task_id, + status=BatchStatus.COMPLETE + ) + + async def handle_task_error(self, run_info: TaskOperatorRunInfo): + await self.adb_client.update_task_status( + task_id=run_info.task_id, + status=BatchStatus.ERROR) + await self.adb_client.add_task_error( + task_id=run_info.task_id, + error=run_info.message + ) + + async def get_task_info(self, task_id: int) -> TaskInfo: + return await self.adb_client.get_task_info(task_id=task_id) + + async def get_tasks( + self, + page: int, + task_type: TaskType, + task_status: BatchStatus + ) -> GetTasksResponse: + return await self.adb_client.get_tasks( + page=page, + task_type=task_type, + task_status=task_status + ) + + + #endregion + + + diff --git a/tests/conftest.py b/tests/conftest.py index fbe5dd50..8aeb6dc6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -66,7 +66,7 @@ def db_client_test(wipe_database) -> DatabaseClient: @pytest.fixture def adb_client_test(wipe_database) -> AsyncDatabaseClient: - conn = get_postgres_connection_string() + conn = get_postgres_connection_string(is_async=True) adb_client = AsyncDatabaseClient(db_url=conn) yield adb_client adb_client.engine.dispose() diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 6964fb86..3cbdb11b 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -190,10 +190,10 @@ def urls( ) ) - return asyncio.run(self.adb_client.insert_urls( + return self.db_client.insert_urls( url_infos=url_infos, batch_id=batch_id, - )) + ) async def url_miscellaneous_metadata( self, diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index 81207a28..1a142651 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -60,6 +60,8 @@ def test_example_collector(api_test_helper): # Flush early to ensure logs are written ath.core.core_logger.flush_all() + time.sleep(10) + lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) assert len(lr.logs) > 0 diff --git a/tests/test_automated/integration/conftest.py b/tests/test_automated/integration/conftest.py index 8ffdc266..cd05cf6f 100644 --- a/tests/test_automated/integration/conftest.py +++ b/tests/test_automated/integration/conftest.py @@ -31,10 +31,7 @@ def test_async_core(db_client_test): adb_client = AsyncDatabaseClient() core = AsyncCore( adb_client=adb_client, - huggingface_interface=MagicMock(), - url_request_interface=MagicMock(), - html_parser=MagicMock(), 
-        discord_poster=MagicMock(),
+        task_manager=MagicMock(),
         collector_manager=AsyncCollectorManager(
             adb_client=adb_client,
             logger=logger,
diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py
index 3fe10580..de1b9b85 100644
--- a/tests/test_automated/integration/core/test_async_core.py
+++ b/tests/test_automated/integration/core/test_async_core.py
@@ -25,10 +25,8 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator):
 
     core = AsyncCore(
         adb_client=ddc.adb_client,
-        huggingface_interface=MagicMock(),
-        url_request_interface=MagicMock(),
-        html_parser=MagicMock(),
-        discord_poster=MagicMock()
+        task_manager=MagicMock(),
+        collector_manager=MagicMock()
     )
     await core.conclude_task(run_info=run_info)
 
@@ -52,13 +50,10 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator):
 
     core = AsyncCore(
         adb_client=ddc.adb_client,
-        huggingface_interface=MagicMock(),
-        url_request_interface=MagicMock(),
-        html_parser=MagicMock(),
-        discord_poster=MagicMock(),
+        task_manager=MagicMock(),
         collector_manager=MagicMock()
     )
-    await core.conclude_task(run_info=run_info)
+    await core.task_manager.conclude_task(run_info=run_info)
 
     task_info = await ddc.adb_client.get_task_info(task_id=task_id)
 
@@ -81,13 +76,10 @@ async def test_conclude_task_error(db_data_creator: DBDataCreator):
 
     core = AsyncCore(
         adb_client=ddc.adb_client,
-        huggingface_interface=MagicMock(),
-        url_request_interface=MagicMock(),
-        html_parser=MagicMock(),
-        discord_poster=MagicMock(),
+        task_manager=MagicMock(),
         collector_manager=MagicMock()
     )
-    await core.conclude_task(run_info=run_info)
+    await core.task_manager.conclude_task(run_info=run_info)
 
     task_info = await ddc.adb_client.get_task_info(task_id=task_id)
 
@@ -99,10 +91,7 @@ async def test_conclude_task_error(db_data_creator: DBDataCreator):
 async def test_run_task_prereq_not_met():
     core = AsyncCore(
         adb_client=AsyncMock(),
-        huggingface_interface=AsyncMock(),
-        url_request_interface=AsyncMock(),
-        html_parser=AsyncMock(),
-        discord_poster=MagicMock(),
+        task_manager=MagicMock(),
         collector_manager=MagicMock()
     )
 
@@ -126,10 +115,7 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo:
 
     core = AsyncCore(
         adb_client=db_data_creator.adb_client,
-        huggingface_interface=AsyncMock(),
-        url_request_interface=AsyncMock(),
-        html_parser=AsyncMock(),
-        discord_poster=MagicMock(),
+        task_manager=MagicMock(),
         collector_manager=MagicMock()
     )
     core.conclude_task = AsyncMock()
 
@@ -171,10 +157,7 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo:
 
     core = AsyncCore(
         adb_client=db_data_creator.adb_client,
-        huggingface_interface=AsyncMock(),
-        url_request_interface=AsyncMock(),
-        html_parser=AsyncMock(),
-        discord_poster=MagicMock(),
+        task_manager=MagicMock(),
         collector_manager=MagicMock()
     )
     core.conclude_task = AsyncMock()

From 6c3fe10e168b593d53c703ceafa509b6f763b186 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 14 Apr 2025 15:35:45 -0400
Subject: [PATCH 097/182] feat(app): make collectors asynchronous and add task
 trigger

Collectors have been redesigned to be asynchronous, rather than existing
in separate threads. Collectors are also now set up to trigger tasks
immediately after collection completion, on top of the existing periodic
task runs.
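A minimal sketch of the intended post-collection flow (illustrative only, not
part of this patch; run_tasks below is a stand-in for TaskManager.run_tasks,
while FunctionTrigger and trigger_or_rerun come from the new
core/FunctionTrigger.py):

    import asyncio

    from core.FunctionTrigger import FunctionTrigger

    async def run_tasks() -> None:
        # Stand-in for TaskManager.run_tasks: drains all eligible task operators.
        await asyncio.sleep(0.01)

    async def main() -> None:
        trigger = FunctionTrigger(run_tasks)
        # Each collector awaits this at the end of process(). Calls made while
        # a run is in flight only flag a rerun, so they coalesce into a single
        # extra pass instead of starting parallel loops.
        await asyncio.gather(
            trigger.trigger_or_rerun(),
            trigger.trigger_or_rerun(),
        )

    asyncio.run(main())

With the sketch above, run_tasks executes twice: once for the initial trigger
and once for the single coalesced rerun.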
--- api/main.py | 34 +++++---- collector_manager/AsyncCollectorBase.py | 11 +-- collector_manager/AsyncCollectorManager.py | 10 ++- core/AsyncCore.py | 2 +- core/FunctionTrigger.py | 30 ++++++++ core/TaskManager.py | 5 ++ .../integration/api/conftest.py | 42 +++++++++-- .../integration/api/test_duplicates.py | 5 ++ .../integration/api/test_example_collector.py | 10 +++ .../integration/core/test_async_core.py | 75 +++++++++---------- .../core/test_example_collector_lifecycle.py | 1 - .../unit/test_function_trigger.py | 67 +++++++++++++++++ 12 files changed, 223 insertions(+), 69 deletions(-) create mode 100644 core/FunctionTrigger.py create mode 100644 tests/test_automated/unit/test_function_trigger.py diff --git a/api/main.py b/api/main.py index cc7e3fa2..79e31542 100644 --- a/api/main.py +++ b/api/main.py @@ -34,29 +34,33 @@ async def lifespan(app: FastAPI): adb_client = AsyncDatabaseClient() await setup_database(db_client) core_logger = CoreLogger(db_client=db_client) - async_collector_manager = AsyncCollectorManager( - logger=core_logger, - adb_client=adb_client, - ) + source_collector_core = SourceCollectorCore( core_logger=CoreLogger( db_client=db_client ), db_client=DatabaseClient(), ) - async_core = AsyncCore( + task_manager = TaskManager( adb_client=adb_client, - task_manager=TaskManager( - adb_client=adb_client, - huggingface_interface=HuggingFaceInterface(), - url_request_interface=URLRequestInterface(), - html_parser=HTMLResponseParser( - root_url_cache=RootURLCache() - ), - discord_poster=DiscordPoster( - webhook_url=get_from_env("DISCORD_WEBHOOK_URL") - ), + huggingface_interface=HuggingFaceInterface(), + url_request_interface=URLRequestInterface(), + html_parser=HTMLResponseParser( + root_url_cache=RootURLCache() ), + discord_poster=DiscordPoster( + webhook_url=get_from_env("DISCORD_WEBHOOK_URL") + ) + ) + async_collector_manager = AsyncCollectorManager( + logger=core_logger, + adb_client=adb_client, + post_collection_function_trigger=task_manager.task_trigger + ) + + async_core = AsyncCore( + adb_client=adb_client, + task_manager=task_manager, collector_manager=async_collector_manager ) async_scheduled_task_manager = AsyncScheduledTaskManager(async_core=async_core) diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py index e93f97fc..ec53f4c6 100644 --- a/collector_manager/AsyncCollectorBase.py +++ b/collector_manager/AsyncCollectorBase.py @@ -11,7 +11,7 @@ from collector_db.DTOs.LogInfo import LogInfo from collector_manager.enums import CollectorType from core.CoreLogger import CoreLogger -from core.TaskManager import TaskManager +from core.FunctionTrigger import FunctionTrigger from core.enums import BatchStatus from core.preprocessors.PreprocessorBase import PreprocessorBase @@ -28,11 +28,9 @@ def __init__( logger: CoreLogger, adb_client: AsyncDatabaseClient, raise_error: bool = False, - trigger_followup_tasks: bool = False, - task_manager: TaskManager = None + post_collection_function_trigger: Optional[FunctionTrigger] = None, ) -> None: - self.trigger_followup_tasks = trigger_followup_tasks - self.task_manager = task_manager + self.post_collection_function_trigger = post_collection_function_trigger self.batch_id = batch_id self.adb_client = adb_client self.dto = dto @@ -95,6 +93,9 @@ async def process(self) -> None: ) await self.log("Done processing collector.", allow_abort=False) + if self.post_collection_function_trigger is not None: + await self.post_collection_function_trigger.trigger_or_rerun() + async def run(self) -> None: 
try: await self.start_timer() diff --git a/collector_manager/AsyncCollectorManager.py b/collector_manager/AsyncCollectorManager.py index af875ddc..bf338c88 100644 --- a/collector_manager/AsyncCollectorManager.py +++ b/collector_manager/AsyncCollectorManager.py @@ -11,6 +11,7 @@ from collector_manager.collector_mapping import COLLECTOR_MAPPING from collector_manager.enums import CollectorType from core.CoreLogger import CoreLogger +from core.FunctionTrigger import FunctionTrigger class AsyncCollectorManager: @@ -19,13 +20,15 @@ def __init__( self, logger: CoreLogger, adb_client: AsyncDatabaseClient, - dev_mode: bool = False + dev_mode: bool = False, + post_collection_function_trigger: FunctionTrigger = None ): self.collectors: Dict[int, AsyncCollectorBase] = {} self.adb_client = adb_client self.logger = logger self.async_tasks: dict[int, asyncio.Task] = {} self.dev_mode = dev_mode + self.post_collection_function_trigger = post_collection_function_trigger async def has_collector(self, cid: int) -> bool: return cid in self.collectors @@ -34,7 +37,7 @@ async def start_async_collector( self, collector_type: CollectorType, batch_id: int, - dto: BaseModel + dto: BaseModel, ) -> None: if batch_id in self.collectors: raise ValueError(f"Collector with batch_id {batch_id} is already running.") @@ -45,7 +48,8 @@ async def start_async_collector( dto=dto, logger=self.logger, adb_client=self.adb_client, - raise_error=True if self.dev_mode else False + raise_error=True if self.dev_mode else False, + post_collection_function_trigger=self.post_collection_function_trigger ) except KeyError: raise InvalidCollectorError(f"Collector {collector_type.value} not found.") diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 971cd03d..b17903db 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -89,7 +89,7 @@ async def initiate_collector( # endregion async def run_tasks(self): - await self.task_manager.run_tasks() + await self.task_manager.trigger_task_run() async def get_tasks( self, diff --git a/core/FunctionTrigger.py b/core/FunctionTrigger.py new file mode 100644 index 00000000..df85482a --- /dev/null +++ b/core/FunctionTrigger.py @@ -0,0 +1,30 @@ +import asyncio +from typing import Callable, Awaitable + +class FunctionTrigger: + """ + A small class used to trigger a function to run in a loop + If the trigger is used again while the task is running, the task will be rerun + """ + + def __init__(self, func: Callable[[], Awaitable[None]]): + self._func = func + self._lock = asyncio.Lock() + self._rerun_requested = False + self._loop_running = False + + async def trigger_or_rerun(self): + if self._loop_running: + self._rerun_requested = True + return + + async with self._lock: + self._loop_running = True + try: + while True: + self._rerun_requested = False + await self._func() + if not self._rerun_requested: + break + finally: + self._loop_running = False diff --git a/core/TaskManager.py b/core/TaskManager.py index 003fda0f..8ec259f5 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -6,6 +6,7 @@ from collector_db.enums import TaskType from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome +from core.FunctionTrigger import FunctionTrigger from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator from core.classes.TaskOperatorBase import TaskOperatorBase from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator @@ -42,6 +43,7 @@ def __init__( self.logger = 
logging.getLogger(__name__)
         self.logger.addHandler(logging.StreamHandler())
         self.logger.setLevel(logging.INFO)
+        self.task_trigger = FunctionTrigger(self.run_tasks)
 
 
 
@@ -123,6 +125,9 @@ async def run_tasks(self):
             count += 1
             meets_prereq = await operator.meets_task_prerequisites()
 
+    async def trigger_task_run(self):
+        await self.task_trigger.trigger_or_rerun()
+
 
     async def conclude_task(self, run_info):
         await self.adb_client.link_urls_to_task(
diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py
index c2e537b1..e51b05dc 100644
--- a/tests/test_automated/integration/api/conftest.py
+++ b/tests/test_automated/integration/api/conftest.py
@@ -1,12 +1,15 @@
 import asyncio
+import logging
+import os
 from dataclasses import dataclass
 from typing import Generator
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, AsyncMock
 
 import pytest
 from starlette.testclient import TestClient
 
 from api.main import app
+from core.AsyncCore import AsyncCore
 from core.SourceCollectorCore import SourceCollectorCore
 from security_manager.SecurityManager import get_access_info, AccessInfo, Permissions
 from tests.helpers.DBDataCreator import DBDataCreator
@@ -17,6 +20,7 @@ class APITestHelper:
 
     request_validator: RequestValidator
     core: SourceCollectorCore
+    async_core: AsyncCore
     db_data_creator: DBDataCreator
     mock_huggingface_interface: MagicMock
     mock_label_studio_interface: MagicMock
@@ -26,28 +30,54 @@ def adb_client(self):
 
 MOCK_USER_ID = 1
 
+def disable_task_trigger(ath: APITestHelper) -> None:
+    ath.async_core.collector_manager.post_collection_function_trigger = AsyncMock()
+
+
+
+async def fail_task_trigger() -> None:
+    raise Exception(
+        "Task Trigger is set to fail in tests by default, to catch unintentional calls. "
+        "If this is not intended, either replace with a Mock or the expected task function."
+    )
 
 def override_access_info() -> AccessInfo:
     return AccessInfo(user_id=MOCK_USER_ID, permissions=[Permissions.SOURCE_COLLECTOR])
 
-@pytest.fixture
-def client(db_client_test, monkeypatch) -> Generator[TestClient, None, None]:
-    monkeypatch.setenv("DISCORD_WEBHOOK_URL", "https://discord.com")
+@pytest.fixture(scope="session")
+def client() -> Generator[TestClient, None, None]:
+    # Mock environment
+    _original_env = dict(os.environ)
+    os.environ["DISCORD_WEBHOOK_URL"] = "https://discord.com"
     with TestClient(app) as c:
         app.dependency_overrides[get_access_info] = override_access_info
         core: SourceCollectorCore = c.app.state.core
+        async_core: AsyncCore = c.app.state.async_core
+
+        # Interfaces to the web should be mocked
+        task_manager = async_core.task_manager
+        task_manager.huggingface_interface = AsyncMock()
+        task_manager.url_request_interface = AsyncMock()
+        task_manager.discord_poster = AsyncMock()
+        # Disable Logger
+        task_manager.logger.disabled = True
+        # Set trigger to fail immediately if called, to force it to be manually specified in tests
+        task_manager.task_trigger._func = fail_task_trigger
         # core.shutdown()
         yield c
         core.shutdown()
+    # Reset environment variables back to original state
+    os.environ.clear()
+    os.environ.update(_original_env)
 
 @pytest.fixture
 def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> APITestHelper:
-
     return APITestHelper(
         request_validator=RequestValidator(client=client),
         core=client.app.state.core,
+        async_core=client.app.state.async_core,
         db_data_creator=db_data_creator,
         mock_huggingface_interface=MagicMock(),
         mock_label_studio_interface=MagicMock()
-    )
\ No newline at end of file
+    )
diff --git a/tests/test_automated/integration/api/test_duplicates.py b/tests/test_automated/integration/api/test_duplicates.py
index 292df507..a026d6a1 100644
--- a/tests/test_automated/integration/api/test_duplicates.py
+++ b/tests/test_automated/integration/api/test_duplicates.py
@@ -1,12 +1,17 @@
 import time
+from unittest.mock import AsyncMock
 
 from collector_db.DTOs.BatchInfo import BatchInfo
 from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO
+from test_automated.integration.api.conftest import disable_task_trigger
 
 
 def test_duplicates(api_test_helper):
     ath = api_test_helper
 
+    # Temporarily disable task trigger
+    disable_task_trigger(ath)
+
     dto = ExampleInputDTO(
         sleep_time=1
     )
diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py
index 1a142651..2f05d1d5 100644
--- a/tests/test_automated/integration/api/test_example_collector.py
+++ b/tests/test_automated/integration/api/test_example_collector.py
@@ -9,11 +9,15 @@
 from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse
 from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse
 from core.enums import BatchStatus
+from test_automated.integration.api.conftest import disable_task_trigger
 
 
 def test_example_collector(api_test_helper):
     ath = api_test_helper
 
+    # Temporarily disable task trigger
+    disable_task_trigger(ath)
+
     dto = ExampleInputDTO(
         sleep_time=1
     )
@@ -66,6 +70,12 @@
 
     assert len(lr.logs) > 0
 
+    # Check that task was triggered
+    ath.async_core.collector_manager.\
+        post_collection_function_trigger.\
+        trigger_or_rerun.assert_called_once()
+
+
 def test_example_collector_error(api_test_helper, monkeypatch):
     """
     Test that when an error occurs in a collector, the batch is properly updated
diff --git
a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py index de1b9b85..b4b8e740 100644 --- a/tests/test_automated/integration/core/test_async_core.py +++ b/tests/test_automated/integration/core/test_async_core.py @@ -3,13 +3,28 @@ import pytest +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.enums import TaskType from collector_db.models import Task from core.AsyncCore import AsyncCore from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome +from core.TaskManager import TaskManager from core.enums import BatchStatus from tests.helpers.DBDataCreator import DBDataCreator +def setup_async_core(adb_client: AsyncDatabaseClient): + return AsyncCore( + adb_client=adb_client, + task_manager=TaskManager( + adb_client=adb_client, + huggingface_interface=AsyncMock(), + url_request_interface=AsyncMock(), + html_parser=AsyncMock(), + discord_poster=AsyncMock(), + ), + collector_manager=AsyncMock() + ) + @pytest.mark.asyncio async def test_conclude_task_success(db_data_creator: DBDataCreator): ddc = db_data_creator @@ -23,11 +38,7 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): outcome=TaskOperatorOutcome.SUCCESS, ) - core = AsyncCore( - adb_client=ddc.adb_client, - task_manager=MagicMock(), - collector_manager=MagicMock() - ) + core = setup_async_core(db_data_creator.adb_client) await core.conclude_task(run_info=run_info) task_info = await ddc.adb_client.get_task_info(task_id=task_id) @@ -48,11 +59,7 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): outcome=TaskOperatorOutcome.SUCCESS, ) - core = AsyncCore( - adb_client=ddc.adb_client, - task_manager=MagicMock(), - collector_manager=MagicMock() - ) + core = setup_async_core(db_data_creator.adb_client) await core.task_manager.conclude_task(run_info=run_info) task_info = await ddc.adb_client.get_task_info(task_id=task_id) @@ -74,11 +81,7 @@ async def test_conclude_task_error(db_data_creator: DBDataCreator): message="test error", ) - core = AsyncCore( - adb_client=ddc.adb_client, - task_manager=MagicMock(), - collector_manager=MagicMock() - ) + core = setup_async_core(db_data_creator.adb_client) await core.task_manager.conclude_task(run_info=run_info) task_info = await ddc.adb_client.get_task_info(task_id=task_id) @@ -89,15 +92,14 @@ async def test_conclude_task_error(db_data_creator: DBDataCreator): @pytest.mark.asyncio async def test_run_task_prereq_not_met(): - core = AsyncCore( - adb_client=AsyncMock(), - task_manager=MagicMock(), - collector_manager=MagicMock() - ) + """ + When a task pre-requisite is not met, the task should not be run + """ + core = setup_async_core(AsyncMock()) mock_operator = AsyncMock() mock_operator.meets_task_prerequisites = AsyncMock(return_value=False) - AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator]) + core.task_manager.get_task_operators = AsyncMock(return_value=[mock_operator]) await core.run_tasks() mock_operator.meets_task_prerequisites.assert_called_once() @@ -105,6 +107,10 @@ async def test_run_task_prereq_not_met(): @pytest.mark.asyncio async def test_run_task_prereq_met(db_data_creator: DBDataCreator): + """ + When a task pre-requisite is met, the task should be run + And a task entry should be created in the database + """ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( @@ -113,12 +119,8 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: linked_url_ids=[1, 2, 3] ) 
- core = AsyncCore( - adb_client=db_data_creator.adb_client, - task_manager=MagicMock(), - collector_manager=MagicMock() - ) - core.conclude_task = AsyncMock() + core = setup_async_core(db_data_creator.adb_client) + core.task_manager.conclude_task = AsyncMock() mock_operator = AsyncMock() mock_operator.meets_task_prerequisites = AsyncMock( @@ -127,9 +129,10 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: mock_operator.task_type = TaskType.HTML mock_operator.run_task = types.MethodType(run_task, mock_operator) - AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator]) + core.task_manager.get_task_operators = AsyncMock(return_value=[mock_operator]) await core.run_tasks() + # There should be two calls to meets_task_prerequisites mock_operator.meets_task_prerequisites.assert_has_calls([call(), call()]) results = await db_data_creator.adb_client.get_all(Task) @@ -137,7 +140,7 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: assert len(results) == 1 assert results[0].task_status == BatchStatus.IN_PROCESS.value - core.conclude_task.assert_called_once() + core.task_manager.conclude_task.assert_called_once() @pytest.mark.asyncio async def test_run_task_break_loop(db_data_creator: DBDataCreator): @@ -155,21 +158,17 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: linked_url_ids=[1, 2, 3] ) - core = AsyncCore( - adb_client=db_data_creator.adb_client, - task_manager=MagicMock(), - collector_manager=MagicMock() - ) - core.conclude_task = AsyncMock() + core = setup_async_core(db_data_creator.adb_client) + core.task_manager.conclude_task = AsyncMock() mock_operator = AsyncMock() mock_operator.meets_task_prerequisites = AsyncMock(return_value=True) mock_operator.task_type = TaskType.HTML mock_operator.run_task = types.MethodType(run_task, mock_operator) - AsyncCore.get_task_operators = AsyncMock(return_value=[mock_operator]) - await core.run_tasks() + core.task_manager.get_task_operators = AsyncMock(return_value=[mock_operator]) + await core.task_manager.trigger_task_run() - core.discord_poster.post_to_discord.assert_called_once_with( + core.task_manager.discord_poster.post_to_discord.assert_called_once_with( message="Task HTML has been run more than 20 times in a row. Task loop terminated." 
) diff --git a/tests/test_automated/integration/core/test_example_collector_lifecycle.py b/tests/test_automated/integration/core/test_example_collector_lifecycle.py index 064a93a4..abe8fb7a 100644 --- a/tests/test_automated/integration/core/test_example_collector_lifecycle.py +++ b/tests/test_automated/integration/core/test_example_collector_lifecycle.py @@ -1,5 +1,4 @@ import asyncio -import time import pytest diff --git a/tests/test_automated/unit/test_function_trigger.py b/tests/test_automated/unit/test_function_trigger.py new file mode 100644 index 00000000..37b3c948 --- /dev/null +++ b/tests/test_automated/unit/test_function_trigger.py @@ -0,0 +1,67 @@ +import asyncio +from collections import deque + +import pytest + +from core.FunctionTrigger import FunctionTrigger + + +@pytest.mark.asyncio +async def test_single_run(): + calls = [] + + async def task_fn(): + calls.append("run") + await asyncio.sleep(0.01) + + trigger = FunctionTrigger(task_fn) + + await trigger.trigger_or_rerun() + + assert calls == ["run"] + +@pytest.mark.asyncio +async def test_rerun_requested(): + call_log = deque() + + async def task_fn(): + call_log.append("start") + await asyncio.sleep(0.01) + call_log.append("end") + + trigger = FunctionTrigger(task_fn) + + # Start first run + task = asyncio.create_task(trigger.trigger_or_rerun()) + + await asyncio.sleep(0.005) # Ensure it's in the middle of first run + await trigger.trigger_or_rerun() # This should request a rerun + + await task + + # One full loop with rerun should call twice + assert list(call_log) == ["start", "end", "start", "end"] + +@pytest.mark.asyncio +async def test_multiple_quick_triggers_only_rerun_once(): + calls = [] + + async def task_fn(): + calls.append("run") + await asyncio.sleep(0.01) + + trigger = FunctionTrigger(task_fn) + + first = asyncio.create_task(trigger.trigger_or_rerun()) + await asyncio.sleep(0.002) + + # These three should all coalesce into one rerun, not three more + await asyncio.gather( + trigger.trigger_or_rerun(), + trigger.trigger_or_rerun(), + trigger.trigger_or_rerun() + ) + + await first + + assert calls == ["run", "run"] \ No newline at end of file From 24173fb405bc67c367204355ba14d0d9f9cab2b0 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 15:46:31 -0400 Subject: [PATCH 098/182] fix(app): fix import bug --- api/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/main.py b/api/main.py index 79e31542..93e4521b 100644 --- a/api/main.py +++ b/api/main.py @@ -1,7 +1,6 @@ from contextlib import asynccontextmanager import uvicorn -from adodbapi.ado_consts import adBSTR from fastapi import FastAPI from api.routes.annotate import annotate_router From f001fb84a5c09e478ea5307dd0a1f1ba07e3c8b4 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 15:48:57 -0400 Subject: [PATCH 099/182] fix(app): fix import bug --- tests/test_automated/integration/api/test_duplicates.py | 2 +- tests/test_automated/integration/api/test_example_collector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_automated/integration/api/test_duplicates.py b/tests/test_automated/integration/api/test_duplicates.py index a026d6a1..c42b894d 100644 --- a/tests/test_automated/integration/api/test_duplicates.py +++ b/tests/test_automated/integration/api/test_duplicates.py @@ -3,7 +3,7 @@ from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO -from test_automated.integration.api.conftest import disable_task_trigger +from 
tests.test_automated.integration.api.conftest import disable_task_trigger def test_duplicates(api_test_helper): diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index 2f05d1d5..48c86145 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -9,7 +9,7 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.enums import BatchStatus -from test_automated.integration.api.conftest import disable_task_trigger +from tests.test_automated.integration.api.conftest import disable_task_trigger def test_example_collector(api_test_helper): From 0dbb987f6a5dd8cef0507cdcadcdcd2ba89efc56 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 16:00:53 -0400 Subject: [PATCH 100/182] fix(tests): comment out inconsistent test --- .../integration/api/test_example_collector.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index 48c86145..acd321c5 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -62,13 +62,14 @@ def test_example_collector(api_test_helper): assert bi.user_id is not None # Flush early to ensure logs are written - ath.core.core_logger.flush_all() - - time.sleep(10) - - lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) - - assert len(lr.logs) > 0 + # Commented out due to inconsistency in execution + # ath.core.core_logger.flush_all() + # + # time.sleep(10) + # + # lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) + # + # assert len(lr.logs) > 0 # Check that task was triggered ath.async_core.collector_manager.\ From afe55d70b1c3a3a828ace44261a9f973e77c8826 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 16:05:08 -0400 Subject: [PATCH 101/182] fix(tests): comment out inconsistent test --- .../integration/api/test_example_collector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index acd321c5..c99119e7 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -105,14 +105,14 @@ def test_example_collector_error(api_test_helper, monkeypatch): assert bi.status == BatchStatus.ERROR - - ath.core.core_logger.flush_all() - - time.sleep(10) - - gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) - assert gbl.logs[-1].log == "Error: Collector failed!" - - + # + # ath.core.core_logger.flush_all() + # + # time.sleep(10) + # + # gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) + # assert gbl.logs[-1].log == "Error: Collector failed!" 
+ # + # From 72caf70625ba664f3ee6982a507a01c0371c72c4 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 18:35:16 -0400 Subject: [PATCH 102/182] feat(app): make logger async --- api/main.py | 14 ++-- collector_db/AsyncDatabaseClient.py | 11 ++- collector_manager/AsyncCollectorBase.py | 12 ++-- collector_manager/AsyncCollectorManager.py | 14 ++-- core/AsyncCoreLogger.py | 71 +++++++++++++++++++ core/SourceCollectorCore.py | 6 +- .../auto_googler/AutoGooglerCollector.py | 2 +- .../muckrock/classes/MuckrockCollector.py | 4 +- .../test_autogoogler_collector.py | 4 +- .../source_collectors/test_ckan_collector.py | 5 +- .../test_common_crawler_collector.py | 3 +- .../test_muckrock_collectors.py | 7 +- .../integration/api/conftest.py | 13 ++-- .../integration/api/test_batch.py | 2 - .../integration/api/test_example_collector.py | 61 ++++++++++------ tests/test_automated/integration/conftest.py | 32 +++++---- .../core/test_example_collector_lifecycle.py | 1 + .../test_autogoogler_collector.py | 3 +- .../source_collectors/test_ckan_collector.py | 3 +- .../test_common_crawl_collector.py | 3 +- .../test_example_collector.py | 7 +- .../test_muckrock_collectors.py | 7 +- 22 files changed, 199 insertions(+), 86 deletions(-) create mode 100644 core/AsyncCoreLogger.py diff --git a/api/main.py b/api/main.py index 93e4521b..19f8de8d 100644 --- a/api/main.py +++ b/api/main.py @@ -14,7 +14,7 @@ from collector_db.DatabaseClient import DatabaseClient from collector_manager.AsyncCollectorManager import AsyncCollectorManager from core.AsyncCore import AsyncCore -from core.CoreLogger import CoreLogger +from core.AsyncCoreLogger import AsyncCoreLogger from core.ScheduledTaskManager import AsyncScheduledTaskManager from core.SourceCollectorCore import SourceCollectorCore from core.TaskManager import TaskManager @@ -32,12 +32,10 @@ async def lifespan(app: FastAPI): db_client = DatabaseClient() adb_client = AsyncDatabaseClient() await setup_database(db_client) - core_logger = CoreLogger(db_client=db_client) + core_logger = AsyncCoreLogger(adb_client=adb_client) + source_collector_core = SourceCollectorCore( - core_logger=CoreLogger( - db_client=db_client - ), db_client=DatabaseClient(), ) task_manager = TaskManager( @@ -68,13 +66,15 @@ async def lifespan(app: FastAPI): app.state.core = source_collector_core app.state.async_core = async_core app.state.async_scheduled_task_manager = async_scheduled_task_manager + app.state.logger = core_logger # Startup logic yield # Code here runs before shutdown # Shutdown logic (if needed) - core_logger.shutdown() - app.state.core.shutdown() + await core_logger.shutdown() + await async_core.shutdown() + source_collector_core.shutdown() # Clean up resources, close connections, etc. 
pass diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 60fdcdfe..c8315fbe 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -14,6 +14,7 @@ from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo +from collector_db.DTOs.LogInfo import LogInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType @@ -27,7 +28,7 @@ from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ - UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate + UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log from collector_manager.enums import URLStatus, CollectorType from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo @@ -1378,6 +1379,14 @@ async def get_url_info_by_url(self, session: AsyncSession, url: str) -> Optional url = raw_result.scalars().first() return URLInfo(**url.__dict__) + @session_manager + async def insert_logs(self, session, log_infos: List[LogInfo]): + for log_info in log_infos: + log = Log(log=log_info.log, batch_id=log_info.batch_id) + if log_info.created_at is not None: + log.created_at = log_info.created_at + session.add(log) + @session_manager async def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo]): for duplicate_info in duplicate_infos: diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py index ec53f4c6..fe260266 100644 --- a/collector_manager/AsyncCollectorBase.py +++ b/collector_manager/AsyncCollectorBase.py @@ -10,7 +10,7 @@ from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.LogInfo import LogInfo from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger +from core.AsyncCoreLogger import AsyncCoreLogger from core.FunctionTrigger import FunctionTrigger from core.enums import BatchStatus from core.preprocessors.PreprocessorBase import PreprocessorBase @@ -25,7 +25,7 @@ def __init__( self, batch_id: int, dto: BaseModel, - logger: CoreLogger, + logger: AsyncCoreLogger, adb_client: AsyncDatabaseClient, raise_error: bool = False, post_collection_function_trigger: Optional[FunctionTrigger] = None, @@ -120,8 +120,12 @@ async def run(self) -> None: self.status = BatchStatus.ERROR await self.handle_error(e) - async def log(self, message: str, allow_abort = True) -> None: - self.logger.log(LogInfo( + async def log( + self, + message: str, + allow_abort = True # Deprecated + ) -> None: + await self.logger.log(LogInfo( batch_id=self.batch_id, log=message )) diff --git a/collector_manager/AsyncCollectorManager.py b/collector_manager/AsyncCollectorManager.py index bf338c88..1851bfc9 100644 --- a/collector_manager/AsyncCollectorManager.py +++ b/collector_manager/AsyncCollectorManager.py @@ -10,7 +10,7 @@ from collector_manager.CollectorManager import InvalidCollectorError from 
collector_manager.collector_mapping import COLLECTOR_MAPPING from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger +from core.AsyncCoreLogger import AsyncCoreLogger from core.FunctionTrigger import FunctionTrigger @@ -18,7 +18,7 @@ class AsyncCollectorManager: def __init__( self, - logger: CoreLogger, + logger: AsyncCoreLogger, adb_client: AsyncDatabaseClient, dev_mode: bool = False, post_collection_function_trigger: FunctionTrigger = None @@ -79,10 +79,16 @@ async def abort_collector_async(self, cid: int) -> None: self.async_tasks.pop(cid) async def shutdown_all_collectors(self) -> None: - for cid, task in self.async_tasks.items(): + while self.async_tasks: + cid, task = self.async_tasks.popitem() if task.done(): try: task.result() except Exception as e: raise e - await self.abort_collector_async(cid) \ No newline at end of file + else: + task.cancel() + try: + await task # Await so cancellation propagates + except asyncio.CancelledError: + pass \ No newline at end of file diff --git a/core/AsyncCoreLogger.py b/core/AsyncCoreLogger.py new file mode 100644 index 00000000..70ca06aa --- /dev/null +++ b/core/AsyncCoreLogger.py @@ -0,0 +1,71 @@ +import asyncio + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.DTOs.LogInfo import LogInfo + + +class AsyncCoreLogger: + def __init__( + self, + adb_client: AsyncDatabaseClient, + flush_interval: float = 10, + batch_size: int = 100 + ): + self.adb_client = adb_client + self.flush_interval = flush_interval + self.batch_size = batch_size + + self.log_queue = asyncio.Queue() + self.lock = asyncio.Lock() + self._flush_task: asyncio.Task | None = None + self._stop_event = asyncio.Event() + + async def __aenter__(self): + self._stop_event.clear() + self._flush_task = asyncio.create_task(self._flush_logs()) + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + await self.shutdown() + + async def log(self, log_info: LogInfo): + await self.log_queue.put(log_info) + + async def _flush_logs(self): + while not self._stop_event.is_set(): + await asyncio.sleep(self.flush_interval) + await self.flush() + + async def flush(self): + async with self.lock: + logs: list[LogInfo] = [] + + while not self.log_queue.empty() and len(logs) < self.batch_size: + try: + log = self.log_queue.get_nowait() + logs.append(log) + except asyncio.QueueEmpty: + break + + if logs: + await self.adb_client.insert_logs(log_infos=logs) + + async def clear_log_queue(self): + while not self.log_queue.empty(): + self.log_queue.get_nowait() + + async def flush_all(self): + while not self.log_queue.empty(): + await self.flush() + + async def restart(self): + await self.flush_all() + await self.shutdown() + self._stop_event.clear() + self._flush_task = asyncio.create_task(self._flush_logs()) + + async def shutdown(self): + self._stop_event.set() + if self._flush_task: + await self._flush_task + await self.flush_all() diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index a0bb34fc..8002717c 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -3,7 +3,6 @@ from collector_db.DatabaseClient import DatabaseClient from collector_manager.enums import CollectorType -from core.CoreLogger import CoreLogger from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse @@ -14,13 +13,12 @@ class SourceCollectorCore: def 
__init__( self, - core_logger: CoreLogger, - collector_manager: Optional[Any] = None, + core_logger: Optional[Any] = None, # Deprecated + collector_manager: Optional[Any] = None, # Deprecated db_client: DatabaseClient = DatabaseClient(), dev_mode: bool = False ): self.db_client = db_client - self.core_logger = core_logger if not dev_mode: self.scheduled_task_manager = ScheduledTaskManager(db_client=db_client) else: diff --git a/source_collectors/auto_googler/AutoGooglerCollector.py b/source_collectors/auto_googler/AutoGooglerCollector.py index b678f066..1748d911 100644 --- a/source_collectors/auto_googler/AutoGooglerCollector.py +++ b/source_collectors/auto_googler/AutoGooglerCollector.py @@ -27,7 +27,7 @@ async def run_to_completion(self) -> AutoGoogler: ) ) async for log in auto_googler.run(): - self.log(log) + await self.log(log) return auto_googler async def run_implementation(self) -> None: diff --git a/source_collectors/muckrock/classes/MuckrockCollector.py b/source_collectors/muckrock/classes/MuckrockCollector.py index 885c0369..0511a21d 100644 --- a/source_collectors/muckrock/classes/MuckrockCollector.py +++ b/source_collectors/muckrock/classes/MuckrockCollector.py @@ -47,9 +47,9 @@ async def run_implementation(self) -> None: self.check_for_count_break(results_count, max_count) except SearchCompleteException: break - self.log(f"Search {search_count}: Found {len(results)} results") + await self.log(f"Search {search_count}: Found {len(results)} results") - self.log(f"Search Complete. Total results: {results_count}") + await self.log(f"Search Complete. Total results: {results_count}") self.data = {"urls": self.format_results(all_results)} def format_results(self, results: list[dict]) -> list[dict]: diff --git a/tests/manual/source_collectors/test_autogoogler_collector.py b/tests/manual/source_collectors/test_autogoogler_collector.py index 78fc46d7..a51fc883 100644 --- a/tests/manual/source_collectors/test_autogoogler_collector.py +++ b/tests/manual/source_collectors/test_autogoogler_collector.py @@ -3,7 +3,7 @@ import pytest from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.auto_googler.AutoGooglerCollector import AutoGooglerCollector from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO @@ -16,7 +16,7 @@ async def test_autogoogler_collector(): urls_per_result=5, queries=["police"], ), - logger = MagicMock(spec=CoreLogger), + logger = AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/manual/source_collectors/test_ckan_collector.py b/tests/manual/source_collectors/test_ckan_collector.py index 3bae5d88..f642fd8d 100644 --- a/tests/manual/source_collectors/test_ckan_collector.py +++ b/tests/manual/source_collectors/test_ckan_collector.py @@ -5,6 +5,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.ckan.CKANCollector import CKANCollector from source_collectors.ckan.DTOs import CKANInputDTO @@ -31,7 +32,7 @@ async def test_ckan_collector_default(): "organization_search": organization_search } ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) @@ 
-80,7 +81,7 @@ async def test_ckan_collector_custom(): ] } ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/manual/source_collectors/test_common_crawler_collector.py b/tests/manual/source_collectors/test_common_crawler_collector.py index 6c9771f3..872b7710 100644 --- a/tests/manual/source_collectors/test_common_crawler_collector.py +++ b/tests/manual/source_collectors/test_common_crawler_collector.py @@ -5,6 +5,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.common_crawler.CommonCrawlerCollector import CommonCrawlerCollector from source_collectors.common_crawler.DTOs import CommonCrawlerInputDTO @@ -18,7 +19,7 @@ async def test_common_crawler_collector(): collector = CommonCrawlerCollector( batch_id=1, dto=CommonCrawlerInputDTO(), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/manual/source_collectors/test_muckrock_collectors.py b/tests/manual/source_collectors/test_muckrock_collectors.py index 8fb80bc4..bfd0ba26 100644 --- a/tests/manual/source_collectors/test_muckrock_collectors.py +++ b/tests/manual/source_collectors/test_muckrock_collectors.py @@ -3,6 +3,7 @@ import pytest from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.muckrock.DTOs import MuckrockSimpleSearchCollectorInputDTO, \ MuckrockCountySearchCollectorInputDTO, MuckrockAllFOIARequestsCollectorInputDTO @@ -22,7 +23,7 @@ async def test_muckrock_simple_search_collector(): search_string="police", max_results=10 ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) @@ -41,7 +42,7 @@ async def test_muckrock_county_level_search_collector(): parent_jurisdiction_id=ALLEGHENY_COUNTY_MUCKROCK_ID, town_names=ALLEGHENY_COUNTY_TOWN_NAMES ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) @@ -61,7 +62,7 @@ async def test_muckrock_full_search_collector(): start_page=1, total_pages=2 ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index e51b05dc..b466bfbb 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -6,10 +6,12 @@ from unittest.mock import MagicMock, AsyncMock import pytest +import pytest_asyncio from starlette.testclient import TestClient from api.main import app from core.AsyncCore import AsyncCore +from core.AsyncCoreLogger import AsyncCoreLogger from core.SourceCollectorCore import SourceCollectorCore from security_manager.SecurityManager import get_access_info, AccessInfo, Permissions from tests.helpers.DBDataCreator import DBDataCreator @@ -51,7 +53,6 @@ def client() -> Generator[TestClient, None, None]: os.environ["DISCORD_WEBHOOK_URL"] = "https://discord.com" with TestClient(app) as c: 
app.dependency_overrides[get_access_info] = override_access_info - core: SourceCollectorCore = c.app.state.core async_core: AsyncCore = c.app.state.async_core # Interfaces to the web should be mocked @@ -63,17 +64,16 @@ def client() -> Generator[TestClient, None, None]: task_manager.logger.disabled = True # Set trigger to fail immediately if called, to force it to be manually specified in tests task_manager.task_trigger._func = fail_task_trigger - # core.shutdown() yield c - core.shutdown() + # Reset environment variables back to original state os.environ.clear() os.environ.update(_original_env) -@pytest.fixture -def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> APITestHelper: - return APITestHelper( +@pytest_asyncio.fixture +async def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> APITestHelper: + yield APITestHelper( request_validator=RequestValidator(client=client), core=client.app.state.core, async_core=client.app.state.async_core, @@ -81,3 +81,4 @@ def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> APITest mock_huggingface_interface=MagicMock(), mock_label_studio_interface=MagicMock() ) + await client.app.state.async_core.collector_manager.logger.clear_log_queue() diff --git a/tests/test_automated/integration/api/test_batch.py b/tests/test_automated/integration/api/test_batch.py index 69c2fcab..604e2d67 100644 --- a/tests/test_automated/integration/api/test_batch.py +++ b/tests/test_automated/integration/api/test_batch.py @@ -20,8 +20,6 @@ def test_abort_batch(api_test_helper): assert response.message == "Batch aborted." - time.sleep(3) - bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) assert bi.status == BatchStatus.ABORTED diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index c99119e7..a235d8e8 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -1,10 +1,15 @@ +import asyncio import time -from unittest.mock import MagicMock +from unittest.mock import MagicMock, AsyncMock +import pytest + +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.ExampleCollector import ExampleCollector from collector_manager.enums import CollectorType +from core.AsyncCoreLogger import AsyncCoreLogger from core.DTOs.BatchStatusInfo import BatchStatusInfo from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse @@ -12,12 +17,17 @@ from tests.test_automated.integration.api.conftest import disable_task_trigger -def test_example_collector(api_test_helper): +@pytest.mark.asyncio +async def test_example_collector(api_test_helper): ath = api_test_helper # Temporarily disable task trigger disable_task_trigger(ath) + logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient()) + await logger.__aenter__() + ath.async_core.collector_manager.logger = logger + dto = ExampleInputDTO( sleep_time=1 ) @@ -40,7 +50,7 @@ def test_example_collector(api_test_helper): assert bsi.strategy == CollectorType.EXAMPLE.value assert bsi.status == BatchStatus.IN_PROCESS - time.sleep(2) + await asyncio.sleep(2) csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, @@ -62,29 +72,33 @@ 
def test_example_collector(api_test_helper): assert bi.user_id is not None # Flush early to ensure logs are written - # Commented out due to inconsistency in execution - # ath.core.core_logger.flush_all() - # - # time.sleep(10) - # - # lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) - # - # assert len(lr.logs) > 0 + await logger.flush_all() + + + lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) + + assert len(lr.logs) > 0 # Check that task was triggered ath.async_core.collector_manager.\ post_collection_function_trigger.\ trigger_or_rerun.assert_called_once() + await logger.__aexit__(None, None, None) -def test_example_collector_error(api_test_helper, monkeypatch): +@pytest.mark.asyncio +async def test_example_collector_error(api_test_helper, monkeypatch): """ Test that when an error occurs in a collector, the batch is properly update """ ath = api_test_helper + logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient()) + await logger.__aenter__() + ath.async_core.collector_manager.logger = logger + # Patch the collector to raise an exception during run_implementation - mock = MagicMock() + mock = AsyncMock() mock.side_effect = Exception("Collector failed!") monkeypatch.setattr(ExampleCollector, 'run_implementation', mock) @@ -99,20 +113,21 @@ def test_example_collector_error(api_test_helper, monkeypatch): assert batch_id is not None assert data["message"] == "Started example collector." - time.sleep(1) + await asyncio.sleep(1) bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) assert bi.status == BatchStatus.ERROR - # - # ath.core.core_logger.flush_all() - # - # time.sleep(10) - # - # gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) - # assert gbl.logs[-1].log == "Error: Collector failed!" - # - # + # Check there are logs + assert not logger.log_queue.empty() + await logger.flush_all() + assert logger.log_queue.empty() + + gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) + assert gbl.logs[-1].log == "Error: Collector failed!" 
+ await logger.__aexit__(None, None, None) + + diff --git a/tests/test_automated/integration/conftest.py b/tests/test_automated/integration/conftest.py index cd05cf6f..6be03e86 100644 --- a/tests/test_automated/integration/conftest.py +++ b/tests/test_automated/integration/conftest.py @@ -5,6 +5,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_manager.AsyncCollectorManager import AsyncCollectorManager from core.AsyncCore import AsyncCore +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from core.SourceCollectorCore import SourceCollectorCore @@ -24,19 +25,20 @@ def test_core(db_client_test): @pytest.fixture -def test_async_core(db_client_test): - with CoreLogger( - db_client=db_client_test - ) as logger: - adb_client = AsyncDatabaseClient() - core = AsyncCore( +def test_async_core(adb_client_test): + logger = AsyncCoreLogger( + adb_client=adb_client_test + ) + adb_client = AsyncDatabaseClient() + core = AsyncCore( + adb_client=adb_client, + task_manager=MagicMock(), + collector_manager=AsyncCollectorManager( adb_client=adb_client, - task_manager=MagicMock(), - collector_manager=AsyncCollectorManager( - adb_client=adb_client, - logger=logger, - dev_mode=True - ), - ) - yield core - core.shutdown() \ No newline at end of file + logger=logger, + dev_mode=True + ), + ) + yield core + core.shutdown() + logger.shutdown() \ No newline at end of file diff --git a/tests/test_automated/integration/core/test_example_collector_lifecycle.py b/tests/test_automated/integration/core/test_example_collector_lifecycle.py index abe8fb7a..d3f3f855 100644 --- a/tests/test_automated/integration/core/test_example_collector_lifecycle.py +++ b/tests/test_automated/integration/core/test_example_collector_lifecycle.py @@ -39,6 +39,7 @@ async def test_example_collector_lifecycle( assert core.get_status(batch_id) == BatchStatus.IN_PROCESS print("Sleeping for 1.5 seconds...") await asyncio.sleep(1.5) + await acore.collector_manager.logger.flush_all() print("Done sleeping...") assert core.get_status(batch_id) == BatchStatus.COMPLETE diff --git a/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py b/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py index 050b1299..2349afe2 100644 --- a/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py @@ -5,6 +5,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.auto_googler.AutoGooglerCollector import AutoGooglerCollector from source_collectors.auto_googler.DTOs import GoogleSearchQueryResultsInnerDTO, AutoGooglerInputDTO @@ -29,7 +30,7 @@ async def test_auto_googler_collector(patch_get_query_results): dto=AutoGooglerInputDTO( queries=["keyword"] ), - logger=AsyncMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/test_automated/unit/source_collectors/test_ckan_collector.py b/tests/test_automated/unit/source_collectors/test_ckan_collector.py index b00ed434..ef7dbee8 100644 --- a/tests/test_automated/unit/source_collectors/test_ckan_collector.py +++ b/tests/test_automated/unit/source_collectors/test_ckan_collector.py @@ -6,6 +6,7 @@ from 
collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.ckan.CKANCollector import CKANCollector from source_collectors.ckan.DTOs import CKANInputDTO @@ -42,7 +43,7 @@ async def test_ckan_collector(mock_ckan_collector_methods): collector = CKANCollector( batch_id=1, dto=CKANInputDTO(), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py b/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py index 74fe1052..d1f0ccda 100644 --- a/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py @@ -5,6 +5,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.common_crawler.CommonCrawlerCollector import CommonCrawlerCollector from source_collectors.common_crawler.DTOs import CommonCrawlerInputDTO @@ -31,7 +32,7 @@ async def test_common_crawl_collector(mock_get_common_crawl_search_results): dto=CommonCrawlerInputDTO( search_term="keyword", ), - logger=mock.MagicMock(spec=CoreLogger), + logger=mock.AsyncMock(spec=AsyncCoreLogger), adb_client=mock.AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) diff --git a/tests/test_automated/unit/source_collectors/test_example_collector.py b/tests/test_automated/unit/source_collectors/test_example_collector.py index 17512a6f..26ca601d 100644 --- a/tests/test_automated/unit/source_collectors/test_example_collector.py +++ b/tests/test_automated/unit/source_collectors/test_example_collector.py @@ -1,8 +1,9 @@ -from unittest.mock import MagicMock +from unittest.mock import MagicMock, AsyncMock from collector_db.DatabaseClient import DatabaseClient from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.ExampleCollector import ExampleCollector +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger @@ -12,8 +13,8 @@ def test_example_collector(): dto=ExampleInputDTO( sleep_time=1 ), - logger=MagicMock(spec=CoreLogger), - adb_client=MagicMock(spec=DatabaseClient), + logger=AsyncMock(spec=AsyncCoreLogger), + adb_client=AsyncMock(spec=DatabaseClient), raise_error=True ) collector.run() \ No newline at end of file diff --git a/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py b/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py index f74c651e..7e533efa 100644 --- a/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py @@ -6,6 +6,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo from collector_db.DatabaseClient import DatabaseClient +from core.AsyncCoreLogger import AsyncCoreLogger from core.CoreLogger import CoreLogger from source_collectors.muckrock.DTOs import MuckrockSimpleSearchCollectorInputDTO, \ MuckrockCountySearchCollectorInputDTO, MuckrockAllFOIARequestsCollectorInputDTO @@ -40,7 +41,7 @@ async def 
test_muckrock_simple_collector(patch_muckrock_fetcher): search_string="keyword", max_results=2 ), - logger=mock.MagicMock(spec=CoreLogger), + logger=mock.AsyncMock(spec=AsyncCoreLogger), adb_client=mock.AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) @@ -97,7 +98,7 @@ async def test_muckrock_county_search_collector(patch_muckrock_county_level_sear parent_jurisdiction_id=1, town_names=["test"] ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) @@ -166,7 +167,7 @@ async def test_muckrock_all_foia_requests_collector(patch_muckrock_full_search_c start_page=1, total_pages=2 ), - logger=MagicMock(spec=CoreLogger), + logger=AsyncMock(spec=AsyncCoreLogger), adb_client=AsyncMock(spec=AsyncDatabaseClient), raise_error=True ) From 5b3658fd1e7092a250d89a5ce186c9e1d6f13084 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 19:26:05 -0400 Subject: [PATCH 103/182] feat(app): add task status endpoint --- api/routes/task.py | 11 ++++++++++- collector_db/DTOs/GetTaskStatusResponseInfo.py | 7 +++++++ collector_db/enums.py | 1 + core/AsyncCore.py | 3 +++ core/TaskManager.py | 10 ++++++++-- core/classes/URLHTMLTaskOperator.py | 1 - .../integration/api/helpers/RequestValidator.py | 9 ++++++++- tests/test_automated/integration/api/test_task.py | 14 ++++++++++++++ 8 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 collector_db/DTOs/GetTaskStatusResponseInfo.py diff --git a/api/routes/task.py b/api/routes/task.py index d9cdbeac..44971959 100644 --- a/api/routes/task.py +++ b/api/routes/task.py @@ -3,6 +3,7 @@ from fastapi import APIRouter, Depends, Query, Path from api.dependencies import get_async_core +from collector_db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType from core.AsyncCore import AsyncCore @@ -39,6 +40,12 @@ async def get_tasks( task_status=task_status ) +@task_router.get("/status") +async def get_task_status( + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetTaskStatusResponseInfo: + return await async_core.get_current_task_status() @task_router.get("/{task_id}") async def get_task_info( @@ -46,4 +53,6 @@ async def get_task_info( async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info) ) -> TaskInfo: - return await async_core.get_task_info(task_id) \ No newline at end of file + return await async_core.get_task_info(task_id) + + diff --git a/collector_db/DTOs/GetTaskStatusResponseInfo.py b/collector_db/DTOs/GetTaskStatusResponseInfo.py new file mode 100644 index 00000000..f6a8d5fc --- /dev/null +++ b/collector_db/DTOs/GetTaskStatusResponseInfo.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + +from collector_db.enums import TaskType + + +class GetTaskStatusResponseInfo(BaseModel): + status: TaskType \ No newline at end of file diff --git a/collector_db/enums.py b/collector_db/enums.py index 0dd956c5..c12cfde0 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -38,6 +38,7 @@ class TaskType(PyEnum): RECORD_TYPE = "Record Type" AGENCY_IDENTIFICATION = "Agency Identification" MISC_METADATA = "Misc Metadata" + IDLE = "Idle" class PGEnum(TypeDecorator): impl = postgresql.ENUM diff --git a/core/AsyncCore.py b/core/AsyncCore.py index b17903db..299a865e 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -4,6 +4,7 @@ from 
collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo +from collector_db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo from collector_db.enums import TaskType from collector_manager.AsyncCollectorManager import AsyncCollectorManager from collector_manager.enums import CollectorType @@ -87,6 +88,8 @@ async def initiate_collector( ) # endregion + async def get_current_task_status(self) -> GetTaskStatusResponseInfo: + return GetTaskStatusResponseInfo(status=self.task_manager.task_status) async def run_tasks(self): await self.task_manager.trigger_task_run() diff --git a/core/TaskManager.py b/core/TaskManager.py index 8ec259f5..64aa57e6 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient @@ -44,12 +45,12 @@ def __init__( self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) self.task_trigger = FunctionTrigger(self.run_tasks) + self.task_status: TaskType = TaskType.IDLE #region Task Operators async def get_url_html_task_operator(self): - self.logger.info("Running URL HTML Task") operator = URLHTMLTaskOperator( adb_client=self.adb_client, url_request_interface=self.url_request_interface, @@ -58,7 +59,6 @@ async def get_url_html_task_operator(self): return operator async def get_url_relevance_huggingface_task_operator(self): - self.logger.info("Running URL Relevance Huggingface Task") operator = URLRelevanceHuggingfaceTaskOperator( adb_client=self.adb_client, huggingface_interface=self.huggingface_interface @@ -106,13 +106,18 @@ async def get_task_operators(self) -> list[TaskOperatorBase]: #endregion #region Tasks + async def set_task_status(self, task_type: TaskType): + self.task_status = task_type + async def run_tasks(self): operators = await self.get_task_operators() count = 0 for operator in operators: + await self.set_task_status(task_type=operator.task_type) meets_prereq = await operator.meets_task_prerequisites() while meets_prereq: + print(f"Running {operator.task_type.value} Task") if count > TASK_REPEAT_THRESHOLD: self.discord_poster.post_to_discord( message=f"Task {operator.task_type.value} has been run" @@ -124,6 +129,7 @@ async def run_tasks(self): await self.conclude_task(run_info) count += 1 meets_prereq = await operator.meets_task_prerequisites() + await self.set_task_status(task_type=TaskType.IDLE) async def trigger_task_run(self): await self.task_trigger.trigger_or_rerun() diff --git a/core/classes/URLHTMLTaskOperator.py b/core/classes/URLHTMLTaskOperator.py index 63321635..ad279f9d 100644 --- a/core/classes/URLHTMLTaskOperator.py +++ b/core/classes/URLHTMLTaskOperator.py @@ -29,7 +29,6 @@ async def meets_task_prerequisites(self): return await self.adb_client.has_pending_urls_without_html_data() async def inner_task_logic(self): - print("Running URL HTML Task...") tdos = await self.get_pending_urls_without_html_data() url_ids = [task_info.url_info.id for task_info in tdos] await self.link_urls_to_task(url_ids=url_ids) diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 02a51b29..f8ada6ae 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -5,6 +5,7 @@ from starlette.testclient 
import TestClient from collector_db.DTOs.BatchInfo import BatchInfo +from collector_db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO @@ -281,4 +282,10 @@ async def reject_and_get_next_source_for_review( url=f"/review/reject-source", json=review_info.model_dump(mode='json') ) - return GetNextURLForFinalReviewOuterResponse(**data) \ No newline at end of file + return GetNextURLForFinalReviewOuterResponse(**data) + + async def get_current_task_status(self) -> GetTaskStatusResponseInfo: + data = self.get( + url=f"/task/status" + ) + return GetTaskStatusResponseInfo(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_task.py b/tests/test_automated/integration/api/test_task.py index d6e13b1f..547b0eb8 100644 --- a/tests/test_automated/integration/api/test_task.py +++ b/tests/test_automated/integration/api/test_task.py @@ -39,3 +39,17 @@ async def test_get_tasks(api_test_helper): assert task.type == TaskType.HTML assert task.url_count == 3 assert task.url_error_count == 1 + +@pytest.mark.asyncio +async def test_get_task_status(api_test_helper): + ath = api_test_helper + + response = await ath.request_validator.get_current_task_status() + + assert response.status == TaskType.IDLE + + for task in [task for task in TaskType]: + await ath.async_core.task_manager.set_task_status(task) + response = await ath.request_validator.get_current_task_status() + + assert response.status == task From 07a4a09a2a8a404cc69349404a3a36aa641bd9b3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 14 Apr 2025 20:36:17 -0400 Subject: [PATCH 104/182] feat(app): add task status endpoint --- collector_db/StatementComposer.py | 21 +++++++++++++------ security_manager/SecurityManager.py | 1 - .../integration/api/test_example_collector.py | 4 ++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index b2b7e706..a84df5a1 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -3,10 +3,11 @@ from sqlalchemy import Select, select, exists, Table, func, Subquery, and_ from sqlalchemy.orm import aliased -from collector_db.enums import URLMetadataAttributeType, ValidationStatus +from collector_db.enums import URLMetadataAttributeType, ValidationStatus, TaskType from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch, \ - ConfirmedURLAgency + ConfirmedURLAgency, LinkTaskURL, Task from collector_manager.enums import URLStatus, CollectorType +from core.enums import BatchStatus class StatementComposer: @@ -16,11 +17,19 @@ class StatementComposer: @staticmethod def pending_urls_without_html_data() -> Select: - return (select(URL). - outerjoin(URLHTMLContent). - where(URLHTMLContent.id == None). - where(URL.outcome == URLStatus.PENDING.value)) + subquery = (select(1). + select_from(LinkTaskURL). + join(Task, LinkTaskURL.task_id == Task.id). + where(LinkTaskURL.url_id == URL.id). + where(Task.task_type == TaskType.HTML.value). 
+                    where(Task.task_status == BatchStatus.COMPLETE.value)
+        )
 
+        query = select(URL).where(
+            ~exists(subquery)
+        )
+
+        return query
 
     @staticmethod
 
diff --git a/security_manager/SecurityManager.py b/security_manager/SecurityManager.py
index 8d80f46c..18bc6a26 100644
--- a/security_manager/SecurityManager.py
+++ b/security_manager/SecurityManager.py
@@ -39,7 +39,6 @@ def __init__(
     def validate_token(self, token: str) -> AccessInfo:
         try:
             payload = jwt.decode(token, self.secret_key, algorithms=[ALGORITHM])
-            print(payload)
             return self.payload_to_access_info(payload)
         except InvalidTokenError as e:
             raise HTTPException(
diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py
index a235d8e8..d1466c8c 100644
--- a/tests/test_automated/integration/api/test_example_collector.py
+++ b/tests/test_automated/integration/api/test_example_collector.py
@@ -24,7 +24,7 @@ async def test_example_collector(api_test_helper):
     # Temporarily disable task trigger
     disable_task_trigger(ath)
 
-    logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient())
+    logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient(), flush_interval=1)
     await logger.__aenter__()
     ath.async_core.collector_manager.logger = logger
 
@@ -93,7 +93,7 @@ async def test_example_collector_error(api_test_helper, monkeypatch):
     """
     ath = api_test_helper
 
-    logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient())
+    logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient(), flush_interval=1)
     await logger.__aenter__()
     ath.async_core.collector_manager.logger = logger
 
From 43f91784ab38556d12c5ecf737802a006290995a Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 14 Apr 2025 20:46:21 -0400
Subject: [PATCH 105/182] fix(app): fix bug with task repetition count

---
 core/TaskManager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/TaskManager.py b/core/TaskManager.py
index 64aa57e6..1440df89 100644
--- a/core/TaskManager.py
+++ b/core/TaskManager.py
@@ -111,8 +111,8 @@ async def set_task_status(self, task_type: TaskType):
 
     async def run_tasks(self):
         operators = await self.get_task_operators()
-        count = 0
         for operator in operators:
+            count = 0
             await self.set_task_status(task_type=operator.task_type)
 
             meets_prereq = await operator.meets_task_prerequisites()
From c84a7bd9beca9f85832235df7360271a406e846 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 14 Apr 2025 21:27:31 -0400
Subject: [PATCH 106/182] fix(app): temporarily disable HTML Task Operator

---
 core/TaskManager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/TaskManager.py b/core/TaskManager.py
index 1440df89..21698156 100644
--- a/core/TaskManager.py
+++ b/core/TaskManager.py
@@ -96,7 +96,7 @@ async def get_url_miscellaneous_metadata_task_operator(self):
 
     async def get_task_operators(self) -> list[TaskOperatorBase]:
         return [
-            await self.get_url_html_task_operator(),
+            # await self.get_url_html_task_operator(),
            await self.get_url_relevance_huggingface_task_operator(),
             await self.get_url_record_type_task_operator(),
             await self.get_agency_identification_task_operator(),
From a4f3be78b41ddd96e96e4225c1509988569b12fc Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Tue, 15 Apr 2025 07:26:37 -0400
Subject: [PATCH 107/182] fix(app): fix bug with task repeating

Also set it up so that if a task errors out, it does not repeat, and
logs to Discord.
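For reviewers, a minimal sketch of how TaskManager.run_tasks reads once this
patch and the repetition-count fix from PATCH 105 are applied. This is
illustrative only, not the literal patched code: the per-iteration print
statement and the exact Discord message strings are elided, the break after
the repeat warning is an assumption (that hunk is not shown in full in this
series), and TASK_REPEAT_THRESHOLD, TaskOperatorOutcome, and
TaskOperatorRunInfo are the existing definitions in core/TaskManager.py:

    async def run_tasks(self):
        operators = await self.get_task_operators()
        for operator in operators:
            # PATCH 105: reset per operator, so one frequently repeating
            # task type cannot trip the threshold for the next one.
            count = 0
            await self.set_task_status(task_type=operator.task_type)
            meets_prereq = await operator.meets_task_prerequisites()
            while meets_prereq:
                if count > TASK_REPEAT_THRESHOLD:
                    # Warn via Discord that the task keeps repeating.
                    self.discord_poster.post_to_discord(message=...)
                    break  # assumed: stop repeating after the warning
                task_id = await self.initiate_task_in_db(task_type=operator.task_type)
                run_info: TaskOperatorRunInfo = await operator.run_task(task_id)
                await self.conclude_task(run_info)
                if run_info.outcome == TaskOperatorOutcome.ERROR:
                    # This patch: a failing task is not re-run;
                    # handle_task_error posts the failure to Discord.
                    break
                count += 1
                meets_prereq = await operator.meets_task_prerequisites()
        await self.set_task_status(task_type=TaskType.IDLE)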
--- collector_db/StatementComposer.py | 12 ++- core/CoreLogger.py | 97 ------------------- core/TaskManager.py | 6 +- .../test_autogoogler_collector.py | 1 - .../source_collectors/test_ckan_collector.py | 2 - .../test_common_crawler_collector.py | 2 - .../test_muckrock_collectors.py | 1 - tests/test_automated/integration/conftest.py | 17 ++-- .../integration/core/test_core_logger.py | 66 ------------- .../unit/collector_manager/__init__.py | 0 .../test_collector_manager.py | 0 .../unit/core/test_core_logger.py | 96 ++++-------------- .../test_autogoogler_collector.py | 2 - .../source_collectors/test_ckan_collector.py | 2 - .../test_common_crawl_collector.py | 2 - .../test_example_collector.py | 3 +- .../test_muckrock_collectors.py | 2 - 17 files changed, 39 insertions(+), 272 deletions(-) delete mode 100644 core/CoreLogger.py delete mode 100644 tests/test_automated/integration/core/test_core_logger.py delete mode 100644 tests/test_automated/unit/collector_manager/__init__.py delete mode 100644 tests/test_automated/unit/collector_manager/test_collector_manager.py diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index a84df5a1..d108a3fa 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -17,18 +17,22 @@ class StatementComposer: @staticmethod def pending_urls_without_html_data() -> Select: - subquery = (select(1). + exclude_subquery = (select(1). select_from(LinkTaskURL). join(Task, LinkTaskURL.task_id == Task.id). where(LinkTaskURL.url_id == URL.id). where(Task.task_type == TaskType.HTML.value). where(Task.task_status == BatchStatus.COMPLETE.value) ) - - query = select(URL).where( - ~exists(subquery) + query = ( + select(URL). + outerjoin(URLHTMLContent). + where(URLHTMLContent.id == None). + where(~exists(exclude_subquery)). + where(URL.outcome == URLStatus.PENDING.value) ) + return query diff --git a/core/CoreLogger.py b/core/CoreLogger.py deleted file mode 100644 index 79263c78..00000000 --- a/core/CoreLogger.py +++ /dev/null @@ -1,97 +0,0 @@ - - -import queue -import threading -import time -from concurrent.futures import Future -from concurrent.futures.thread import ThreadPoolExecutor - -from collector_db.DTOs.LogInfo import LogInfo -from collector_db.DatabaseClient import DatabaseClient - - -class CoreLogger: - def __init__( - self, - db_client: DatabaseClient, - flush_interval=10, - batch_size=100 - ): - self.db_client = db_client - self.flush_interval = flush_interval - self.batch_size = batch_size - - self.log_queue = queue.Queue() - self.lock = threading.Lock() - self.stop_event = threading.Event() - # Start the periodic flush task - self.executor = ThreadPoolExecutor(max_workers=1) - self.flush_future: Future = self.executor.submit(self._flush_logs) - - def __enter__(self): - """ - Start the logger for use in a context. - """ - return self - - def __exit__(self, exc_type, exc_value, traceback): - """ - Gracefully shut down the logger when exiting the context. - """ - self.shutdown() - - def log(self, log_info: LogInfo): - """ - Adds a log entry to the queue. - """ - self.log_queue.put(log_info) - - def _flush_logs(self): - """ - Periodically flushes logs from the queue to the database. - """ - while not self.stop_event.is_set(): - time.sleep(self.flush_interval) - self.flush() - - def flush(self): - """ - Flushes all logs from the queue to the database in batches. 
- """ - with self.lock: - logs: list[LogInfo] = [] - while not self.log_queue.empty() and len(logs) < self.batch_size: - try: - log = self.log_queue.get_nowait() - logs.append(log) - except queue.Empty: - break - - if logs: - try: - self.db_client.insert_logs(log_infos=logs) - except Exception as e: - # Handle logging database errors (e.g., save to fallback storage) - print(f"Error while flushing logs: {e}") - - def flush_all(self): - """ - Flushes all logs from the queue to the database. - """ - while not self.log_queue.empty(): - self.flush() - - def restart(self): - self.flush_all() - self.executor.shutdown(wait=False) - self.executor = ThreadPoolExecutor(max_workers=1) - self.flush_future = self.executor.submit(self._flush_logs) - - def shutdown(self): - """ - Stops the logger gracefully and flushes any remaining logs. - """ - self.stop_event.set() - # if self.flush_future and not self.flush_future.done(): - self.flush_future.result(timeout=10) - self.flush_all() # Flush remaining logs diff --git a/core/TaskManager.py b/core/TaskManager.py index 64aa57e6..77844d91 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -111,8 +111,8 @@ async def set_task_status(self, task_type: TaskType): async def run_tasks(self): operators = await self.get_task_operators() - count = 0 for operator in operators: + count = 0 await self.set_task_status(task_type=operator.task_type) meets_prereq = await operator.meets_task_prerequisites() @@ -127,6 +127,8 @@ async def run_tasks(self): task_id = await self.initiate_task_in_db(task_type=operator.task_type) run_info: TaskOperatorRunInfo = await operator.run_task(task_id) await self.conclude_task(run_info) + if run_info.outcome == TaskOperatorOutcome.ERROR: + break count += 1 meets_prereq = await operator.meets_task_prerequisites() await self.set_task_status(task_type=TaskType.IDLE) @@ -165,6 +167,8 @@ async def handle_task_error(self, run_info: TaskOperatorRunInfo): task_id=run_info.task_id, error=run_info.message ) + await self.discord_poster.post_to_discord( + message=f"Task {run_info.task_id} ({self.task_status.value}) failed with error.") async def get_task_info(self, task_id: int) -> TaskInfo: return await self.adb_client.get_task_info(task_id=task_id) diff --git a/tests/manual/source_collectors/test_autogoogler_collector.py b/tests/manual/source_collectors/test_autogoogler_collector.py index a51fc883..c9942106 100644 --- a/tests/manual/source_collectors/test_autogoogler_collector.py +++ b/tests/manual/source_collectors/test_autogoogler_collector.py @@ -4,7 +4,6 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.auto_googler.AutoGooglerCollector import AutoGooglerCollector from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO diff --git a/tests/manual/source_collectors/test_ckan_collector.py b/tests/manual/source_collectors/test_ckan_collector.py index f642fd8d..f9deaf02 100644 --- a/tests/manual/source_collectors/test_ckan_collector.py +++ b/tests/manual/source_collectors/test_ckan_collector.py @@ -4,9 +4,7 @@ from marshmallow import Schema, fields from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.DatabaseClient import DatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.ckan.CKANCollector import CKANCollector from source_collectors.ckan.DTOs import CKANInputDTO from 
source_collectors.ckan.search_terms import package_search, group_search, organization_search diff --git a/tests/manual/source_collectors/test_common_crawler_collector.py b/tests/manual/source_collectors/test_common_crawler_collector.py index 872b7710..cb1c4f78 100644 --- a/tests/manual/source_collectors/test_common_crawler_collector.py +++ b/tests/manual/source_collectors/test_common_crawler_collector.py @@ -4,9 +4,7 @@ from marshmallow import Schema, fields from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.DatabaseClient import DatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.common_crawler.CommonCrawlerCollector import CommonCrawlerCollector from source_collectors.common_crawler.DTOs import CommonCrawlerInputDTO diff --git a/tests/manual/source_collectors/test_muckrock_collectors.py b/tests/manual/source_collectors/test_muckrock_collectors.py index bfd0ba26..49bfa5fb 100644 --- a/tests/manual/source_collectors/test_muckrock_collectors.py +++ b/tests/manual/source_collectors/test_muckrock_collectors.py @@ -4,7 +4,6 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.muckrock.DTOs import MuckrockSimpleSearchCollectorInputDTO, \ MuckrockCountySearchCollectorInputDTO, MuckrockAllFOIARequestsCollectorInputDTO from source_collectors.muckrock.classes.MuckrockCollector import MuckrockSimpleSearchCollector, \ diff --git a/tests/test_automated/integration/conftest.py b/tests/test_automated/integration/conftest.py index 6be03e86..a0180800 100644 --- a/tests/test_automated/integration/conftest.py +++ b/tests/test_automated/integration/conftest.py @@ -6,22 +6,17 @@ from collector_manager.AsyncCollectorManager import AsyncCollectorManager from core.AsyncCore import AsyncCore from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from core.SourceCollectorCore import SourceCollectorCore @pytest.fixture def test_core(db_client_test): - with CoreLogger( - db_client=db_client_test - ) as logger: - core = SourceCollectorCore( - db_client=db_client_test, - core_logger=logger, - dev_mode=True - ) - yield core - core.shutdown() + core = SourceCollectorCore( + db_client=db_client_test, + dev_mode=True + ) + yield core + core.shutdown() @pytest.fixture diff --git a/tests/test_automated/integration/core/test_core_logger.py b/tests/test_automated/integration/core/test_core_logger.py deleted file mode 100644 index 07a98000..00000000 --- a/tests/test_automated/integration/core/test_core_logger.py +++ /dev/null @@ -1,66 +0,0 @@ -import threading -import time - -from collector_db.DTOs.LogInfo import LogInfo -from core.CoreLogger import CoreLogger -from tests.helpers.DBDataCreator import DBDataCreator - - -def test_logger_integration(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - db_client = db_data_creator.db_client - with CoreLogger(flush_interval=1, db_client=db_client) as logger: - - # Simulate logging - logger.log(LogInfo(log="Integration Log 1", batch_id=batch_id)) - logger.log(LogInfo(log="Integration Log 2", batch_id=batch_id)) - - # Wait for the flush interval - time.sleep(1.5) - - # Verify logs in the database - logs = db_client.get_logs_by_batch_id(batch_id) - assert len(logs) == 2 - assert logs[0].log == "Integration Log 1" - - -def test_multithreaded_integration_with_live_db(db_data_creator: DBDataCreator): - # 
Ensure the database is empty - db_client = db_data_creator.db_client - db_client.delete_all_logs() - - batch_ids = [db_data_creator.batch() for _ in range(5)] - db_client = db_data_creator.db_client - logger = CoreLogger(flush_interval=1, db_client=db_client, batch_size=10) - - # Simulate multiple threads logging - def worker(thread_id): - batch_id = batch_ids[thread_id-1] - for i in range(10): # Each thread logs 10 messages - logger.log(LogInfo(log=f"Thread-{thread_id} Log-{i}", batch_id=batch_id)) - - # Start multiple threads - threads = [threading.Thread(target=worker, args=(i+1,)) for i in range(5)] # 5 threads - for t in threads: - t.start() - for t in threads: - t.join() - - # Allow the logger to flush - logger.shutdown() - time.sleep(10) - - # Verify logs in the database - logs = db_client.get_all_logs() - - # Optional: Print logs for manual inspection - for log in logs: - print(log.log) - - # Assertions - assert len(logs) == 50 # 5 threads * 10 messages each - for i in range(1,6): - for j in range(10): - assert any(log.log == f"Thread-{i} Log-{j}" for log in logs) - - diff --git a/tests/test_automated/unit/collector_manager/__init__.py b/tests/test_automated/unit/collector_manager/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_automated/unit/collector_manager/test_collector_manager.py b/tests/test_automated/unit/collector_manager/test_collector_manager.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_automated/unit/core/test_core_logger.py b/tests/test_automated/unit/core/test_core_logger.py index 22d08bfb..d91ce6cd 100644 --- a/tests/test_automated/unit/core/test_core_logger.py +++ b/tests/test_automated/unit/core/test_core_logger.py @@ -1,86 +1,28 @@ -import threading -import time -from unittest.mock import MagicMock +import asyncio +from unittest.mock import AsyncMock -from collector_db.DTOs.LogInfo import LogInfo -from core.CoreLogger import CoreLogger - - -def test_logger_flush(): - mock_db_client = MagicMock() - logger = CoreLogger(flush_interval=1, db_client=mock_db_client) - - # Add logs - logger.log(LogInfo(log="Log 1", batch_id=1)) - logger.log(LogInfo(log="Log 2", batch_id=1)) - - # Wait for the flush interval - time.sleep(1.5) - - # Verify logs were flushed - assert mock_db_client.insert_logs.called - flushed_logs = mock_db_client.insert_logs.call_args[1]['log_infos'] - assert len(flushed_logs) == 2 - assert flushed_logs[0].log == "Log 1" - - logger.shutdown() - -def test_logger_multithreading(): - mock_db_client = MagicMock() - logger = CoreLogger(flush_interval=1, db_client=mock_db_client, batch_size=10) - - def worker(thread_id): - for i in range(5): # Each thread logs 5 messages - logger.log(LogInfo(log=f"Thread-{thread_id} Log-{i}", batch_id=thread_id)) - - # Start multiple threads - threads = [threading.Thread(target=worker, args=(i,)) for i in range(5)] # 5 threads - for t in threads: - t.start() - for t in threads: - t.join() # Wait for all threads to finish - - # Allow the logger to flush - time.sleep(2) - logger.shutdown() - - # Verify all logs were flushed - assert mock_db_client.insert_logs.called - flushed_logs = [] - for call in mock_db_client.insert_logs.call_args_list: - flushed_logs.extend(call[1]['log_infos']) - - # Ensure all logs are present - assert len(flushed_logs) == 25 # 5 threads * 5 messages each - for i in range(5): - for j in range(5): - assert any(log.log == f"Thread-{i} Log-{j}" for log in flushed_logs) +import pytest +from collector_db.DTOs.LogInfo import LogInfo +from 
core.AsyncCoreLogger import AsyncCoreLogger -def test_logger_with_delays(): - mock_db_client = MagicMock() - logger = CoreLogger(flush_interval=1, db_client=mock_db_client, batch_size=10) - def worker(thread_id): - for i in range(10): # Each thread logs 10 messages - logger.log(LogInfo(log=f"Thread-{thread_id} Log-{i}", batch_id=thread_id)) - time.sleep(0.1) # Simulate delay between logs +@pytest.mark.asyncio +async def test_logger_flush(): + mock_adb_client = AsyncMock() + async with AsyncCoreLogger(flush_interval=1, adb_client=mock_adb_client) as logger: - # Start multiple threads - threads = [threading.Thread(target=worker, args=(i,)) for i in range(5)] # 5 threads - for t in threads: - t.start() - for t in threads: - t.join() # Wait for all threads to finish + # Add logs + await logger.log(LogInfo(log="Log 1", batch_id=1)) + await logger.log(LogInfo(log="Log 2", batch_id=1)) - # Allow the logger to flush - time.sleep(2) - logger.shutdown() + # Wait for the flush interval + await asyncio.sleep(1.5) - # Verify that all logs are eventually flushed - flushed_logs = [] - for call in mock_db_client.insert_logs.call_args_list: - flushed_logs.extend(call[1]['log_infos']) + # Verify logs were flushed + mock_adb_client.insert_logs.assert_called_once() + flushed_logs = mock_adb_client.insert_logs.call_args[1]['log_infos'] + assert len(flushed_logs) == 2 + assert flushed_logs[0].log == "Log 1" - assert len(flushed_logs) == 50 # 5 threads * 10 messages each diff --git a/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py b/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py index 2349afe2..c3fafa61 100644 --- a/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/test_automated/unit/source_collectors/test_autogoogler_collector.py @@ -4,9 +4,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo -from collector_db.DatabaseClient import DatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.auto_googler.AutoGooglerCollector import AutoGooglerCollector from source_collectors.auto_googler.DTOs import GoogleSearchQueryResultsInnerDTO, AutoGooglerInputDTO diff --git a/tests/test_automated/unit/source_collectors/test_ckan_collector.py b/tests/test_automated/unit/source_collectors/test_ckan_collector.py index ef7dbee8..e0e9ee47 100644 --- a/tests/test_automated/unit/source_collectors/test_ckan_collector.py +++ b/tests/test_automated/unit/source_collectors/test_ckan_collector.py @@ -5,9 +5,7 @@ import pytest from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.DatabaseClient import DatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.ckan.CKANCollector import CKANCollector from source_collectors.ckan.DTOs import CKANInputDTO diff --git a/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py b/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py index d1f0ccda..1c5aa6ee 100644 --- a/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/test_automated/unit/source_collectors/test_common_crawl_collector.py @@ -4,9 +4,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo -from collector_db.DatabaseClient import DatabaseClient from core.AsyncCoreLogger import 
AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.common_crawler.CommonCrawlerCollector import CommonCrawlerCollector from source_collectors.common_crawler.DTOs import CommonCrawlerInputDTO diff --git a/tests/test_automated/unit/source_collectors/test_example_collector.py b/tests/test_automated/unit/source_collectors/test_example_collector.py index 26ca601d..b770d952 100644 --- a/tests/test_automated/unit/source_collectors/test_example_collector.py +++ b/tests/test_automated/unit/source_collectors/test_example_collector.py @@ -1,10 +1,9 @@ -from unittest.mock import MagicMock, AsyncMock +from unittest.mock import AsyncMock from collector_db.DatabaseClient import DatabaseClient from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.ExampleCollector import ExampleCollector from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger def test_example_collector(): diff --git a/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py b/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py index 7e533efa..100fbb6e 100644 --- a/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/test_automated/unit/source_collectors/test_muckrock_collectors.py @@ -5,9 +5,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo -from collector_db.DatabaseClient import DatabaseClient from core.AsyncCoreLogger import AsyncCoreLogger -from core.CoreLogger import CoreLogger from source_collectors.muckrock.DTOs import MuckrockSimpleSearchCollectorInputDTO, \ MuckrockCountySearchCollectorInputDTO, MuckrockAllFOIARequestsCollectorInputDTO from source_collectors.muckrock.classes.MuckrockCollector import MuckrockSimpleSearchCollector, \ From 82e65b9f13e008821c0de9ee429cefe9c0511cfa Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 15 Apr 2025 16:47:03 -0400 Subject: [PATCH 108/182] feat(app): add submit approved URL task --- ...3794fa4e9_add_submit_url_task_type_enum.py | 48 +++++ ...33d2e_revert_to_pending_validated_urls_.py | 42 +++++ api/main.py | 32 +++- api/routes/batch.py | 18 +- collector_db/AsyncDatabaseClient.py | 111 +++++++++-- collector_db/DTOs/URLInfo.py | 1 + collector_db/DatabaseClient.py | 96 +--------- collector_db/helper_functions.py | 13 +- collector_db/models.py | 6 +- core/AsyncCore.py | 29 ++- .../task_data_objects/SubmitApprovedURLTDO.py | 9 +- core/EnvVarManager.py | 76 ++++++++ core/SourceCollectorCore.py | 27 +-- core/TaskManager.py | 29 +-- core/classes/SubmitApprovedURLTaskOperator.py | 42 +++-- core/enums.py | 18 +- html_tag_collector/RootURLCache.py | 4 +- llm_api_logic/OpenAIRecordClassifier.py | 5 +- pdap_api_client/AccessManager.py | 4 +- pdap_api_client/DTOs.py | 1 + pdap_api_client/PDAPClient.py | 65 +++++-- start_mirrored_local_app.py | 62 ++++--- tests/conftest.py | 31 +++- tests/helpers/DBDataCreator.py | 12 +- .../integration/api/conftest.py | 12 +- .../integration/api/test_annotate.py | 3 - .../integration/api/test_duplicates.py | 3 +- .../collector_db/test_database_structure.py | 9 +- .../collector_db/test_db_client.py | 17 +- .../integration/core/test_async_core.py | 1 + .../tasks/test_submit_approved_url_task.py | 174 +++++++++++------- util/DiscordNotifier.py | 8 +- util/helper_functions.py | 13 ++ 33 files changed, 687 insertions(+), 334 deletions(-) create mode 100644 alembic/versions/2025_04_15_1338-b363794fa4e9_add_submit_url_task_type_enum.py create mode 100644 
alembic/versions/2025_04_15_1532-ed06a5633d2e_revert_to_pending_validated_urls_.py create mode 100644 core/EnvVarManager.py diff --git a/alembic/versions/2025_04_15_1338-b363794fa4e9_add_submit_url_task_type_enum.py b/alembic/versions/2025_04_15_1338-b363794fa4e9_add_submit_url_task_type_enum.py new file mode 100644 index 00000000..e1d5b725 --- /dev/null +++ b/alembic/versions/2025_04_15_1338-b363794fa4e9_add_submit_url_task_type_enum.py @@ -0,0 +1,48 @@ +"""Add Submit URL Task Type Enum + +Revision ID: b363794fa4e9 +Revises: 33a546c93441 +Create Date: 2025-04-15 13:38:58.293627 + +""" +from typing import Sequence, Union + + +from util.alembic_helpers import switch_enum_type + +# revision identifiers, used by Alembic. +revision: str = 'b363794fa4e9' +down_revision: Union[str, None] = '33a546c93441' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + "HTML", + "Relevancy", + "Record Type", + "Agency Identification", + "Misc Metadata", + "Submit Approved URLs" + ] + ) + + +def downgrade() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + "HTML", + "Relevancy", + "Record Type", + "Agency Identification", + "Misc Metadata", + ] + ) \ No newline at end of file diff --git a/alembic/versions/2025_04_15_1532-ed06a5633d2e_revert_to_pending_validated_urls_.py b/alembic/versions/2025_04_15_1532-ed06a5633d2e_revert_to_pending_validated_urls_.py new file mode 100644 index 00000000..82ce97a4 --- /dev/null +++ b/alembic/versions/2025_04_15_1532-ed06a5633d2e_revert_to_pending_validated_urls_.py @@ -0,0 +1,42 @@ +"""Revert to pending validated URLs without name and add constraint + +Revision ID: ed06a5633d2e +Revises: b363794fa4e9 +Create Date: 2025-04-15 15:32:26.465488 + +""" +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. 
+revision: str = 'ed06a5633d2e' +down_revision: Union[str, None] = 'b363794fa4e9' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + + op.execute( + """ + UPDATE public.urls + SET OUTCOME = 'pending' + WHERE OUTCOME = 'validated' AND NAME IS NULL + """ + ) + + op.create_check_constraint( + 'url_name_not_null_when_validated', + 'urls', + "NAME IS NOT NULL OR OUTCOME != 'validated'" + ) + + +def downgrade() -> None: + op.drop_constraint( + 'url_name_not_null_when_validated', + 'urls', + type_='check' + ) diff --git a/api/main.py b/api/main.py index 19f8de8d..40970e4f 100644 --- a/api/main.py +++ b/api/main.py @@ -1,5 +1,6 @@ from contextlib import asynccontextmanager +import aiohttp import uvicorn from fastapi import FastAPI @@ -15,6 +16,7 @@ from collector_manager.AsyncCollectorManager import AsyncCollectorManager from core.AsyncCore import AsyncCore from core.AsyncCoreLogger import AsyncCoreLogger +from core.EnvVarManager import EnvVarManager from core.ScheduledTaskManager import AsyncScheduledTaskManager from core.SourceCollectorCore import SourceCollectorCore from core.TaskManager import TaskManager @@ -22,18 +24,27 @@ from html_tag_collector.RootURLCache import RootURLCache from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.PDAPClient import PDAPClient from util.DiscordNotifier import DiscordPoster -from util.helper_functions import get_from_env + @asynccontextmanager async def lifespan(app: FastAPI): + env_var_manager = EnvVarManager.get() + # Initialize shared dependencies - db_client = DatabaseClient() - adb_client = AsyncDatabaseClient() + db_client = DatabaseClient( + db_url=env_var_manager.get_postgres_connection_string() + ) + adb_client = AsyncDatabaseClient( + db_url=env_var_manager.get_postgres_connection_string(is_async=True) + ) await setup_database(db_client) core_logger = AsyncCoreLogger(adb_client=adb_client) + session = aiohttp.ClientSession() source_collector_core = SourceCollectorCore( db_client=DatabaseClient(), @@ -46,7 +57,15 @@ async def lifespan(app: FastAPI): root_url_cache=RootURLCache() ), discord_poster=DiscordPoster( - webhook_url=get_from_env("DISCORD_WEBHOOK_URL") + webhook_url=env_var_manager.discord_webhook_url + ), + pdap_client=PDAPClient( + access_manager=AccessManager( + email=env_var_manager.pdap_email, + password=env_var_manager.pdap_password, + api_key=env_var_manager.pdap_api_key, + session=session + ) ) ) async_collector_manager = AsyncCollectorManager( @@ -72,17 +91,17 @@ async def lifespan(app: FastAPI): yield # Code here runs before shutdown # Shutdown logic (if needed) + # Clean up resources, close connections, etc. await core_logger.shutdown() await async_core.shutdown() source_collector_core.shutdown() - # Clean up resources, close connections, etc. 
+ await session.close() pass async def setup_database(db_client): # Initialize database if dev environment, otherwise apply migrations try: - get_from_env("DEV") db_client.init_db() except Exception as e: return @@ -95,6 +114,7 @@ async def setup_database(db_client): lifespan=lifespan ) + routers = [ root_router, collector_router, diff --git a/api/routes/batch.py b/api/routes/batch.py index 23df2394..9d4b62cc 100644 --- a/api/routes/batch.py +++ b/api/routes/batch.py @@ -25,7 +25,7 @@ @batch_router.get("") -def get_batch_status( +async def get_batch_status( collector_type: Optional[CollectorType] = Query( description="Filter by collector type", default=None @@ -38,13 +38,13 @@ def get_batch_status( description="The page number", default=1 ), - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> GetBatchStatusResponse: """ Get the status of recent batches """ - return core.get_batch_statuses(collector_type=collector_type, status=status, page=page) + return await core.get_batch_statuses(collector_type=collector_type, status=status, page=page) @batch_router.get("/{batch_id}") @@ -69,28 +69,28 @@ async def get_urls_by_batch( return await core.get_urls_by_batch(batch_id, page=page) @batch_router.get("/{batch_id}/duplicates") -def get_duplicates_by_batch( +async def get_duplicates_by_batch( batch_id: int = Path(description="The batch id"), page: int = Query( description="The page number", default=1 ), - core: SourceCollectorCore = Depends(get_core), + core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> GetDuplicatesByBatchResponse: - return core.get_duplicate_urls_by_batch(batch_id, page=page) + return await core.get_duplicate_urls_by_batch(batch_id, page=page) @batch_router.get("/{batch_id}/logs") -def get_batch_logs( +async def get_batch_logs( batch_id: int = Path(description="The batch id"), - core: SourceCollectorCore = Depends(get_core), + async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> GetBatchLogsResponse: """ Retrieve the logs for a recent batch. Note that for later batches, the logs may not be available. 
""" - return core.get_batch_logs(batch_id) + return await async_core.get_batch_logs(batch_id) @batch_router.post("/{batch_id}/abort") async def abort_batch( diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index c44468a4..98410b6f 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -5,16 +5,16 @@ from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert, asc from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker -from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute +from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute, aliased from sqlalchemy.sql.functions import coalesce from starlette import status from collector_db.ConfigManager import ConfigManager from collector_db.DTOConverter import DTOConverter from collector_db.DTOs.BatchInfo import BatchInfo -from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo +from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo, DuplicateInfo from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo -from collector_db.DTOs.LogInfo import LogInfo +from collector_db.DTOs.LogInfo import LogInfo, LogOutputInfo from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType @@ -41,8 +41,9 @@ GetURLsResponseInnerInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO -from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo +from core.EnvVarManager import EnvVarManager from core.enums import BatchStatus, SuggestionType, RecordType from html_tag_collector.DataClassTags import convert_to_response_html_info @@ -58,7 +59,9 @@ def add_standard_limit_and_offset(statement, page, limit=100): class AsyncDatabaseClient: - def __init__(self, db_url: str = get_postgres_connection_string(is_async=True)): + def __init__(self, db_url: Optional[str] = None): + if db_url is None: + db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) self.engine = create_async_engine( url=db_url, echo=ConfigManager.get_sqlalchemy_echo(), @@ -1487,8 +1490,9 @@ async def get_validated_urls( .where(URL.outcome == URLStatus.VALIDATED.value) .options( selectinload(URL.optional_data_source_metadata), - selectinload(URL.confirmed_agencies) - ) + selectinload(URL.confirmed_agencies), + selectinload(URL.reviewing_user) + ).limit(100) ) urls = await session.execute(query) urls = urls.scalars().all() @@ -1497,6 +1501,17 @@ async def get_validated_urls( agency_ids = [] for agency in url.confirmed_agencies: agency_ids.append(agency.agency_id) + optional_metadata = url.optional_data_source_metadata + + if optional_metadata is None: + record_formats = None + data_portal_type = None + supplying_entity = None + else: + record_formats = optional_metadata.record_formats + data_portal_type = optional_metadata.data_portal_type + supplying_entity = optional_metadata.supplying_entity + tdo = SubmitApprovedURLTDO( url_id=url.id, url=url.url, @@ -1504,18 +1519,19 @@ 
async def get_validated_urls( agency_ids=agency_ids, description=url.description, record_type=url.record_type, - record_formats=url.optional_data_source_metadata.record_formats, - data_portal_type=url.optional_data_source_metadata.data_portal_type, - supplying_entity=url.optional_data_source_metadata.supplying_entity, + record_formats=record_formats, + data_portal_type=data_portal_type, + supplying_entity=supplying_entity, + approving_user_id=url.reviewing_user.user_id ) results.append(tdo) return results @session_manager - async def mark_urls_as_submitted(self, session: AsyncSession, tdos: list[SubmitApprovedURLTDO]): - for tdo in tdos: - url_id = tdo.url_id - data_source_id = tdo.data_source_id + async def mark_urls_as_submitted(self, session: AsyncSession, infos: list[SubmittedURLInfo]): + for info in infos: + url_id = info.url_id + data_source_id = info.data_source_id query = ( update(URL) .where(URL.id == url_id) @@ -1526,3 +1542,70 @@ async def mark_urls_as_submitted(self, session: AsyncSession, tdos: list[SubmitA ) await session.execute(query) + @session_manager + async def get_duplicates_by_batch_id(self, session, batch_id: int, page: int) -> List[DuplicateInfo]: + original_batch = aliased(Batch) + duplicate_batch = aliased(Batch) + + query = ( + Select( + URL.url.label("source_url"), + URL.id.label("original_url_id"), + duplicate_batch.id.label("duplicate_batch_id"), + duplicate_batch.parameters.label("duplicate_batch_parameters"), + original_batch.id.label("original_batch_id"), + original_batch.parameters.label("original_batch_parameters"), + ) + .select_from(Duplicate) + .join(URL, Duplicate.original_url_id == URL.id) + .join(duplicate_batch, Duplicate.batch_id == duplicate_batch.id) + .join(original_batch, URL.batch_id == original_batch.id) + .filter(duplicate_batch.id == batch_id) + .limit(100) + .offset((page - 1) * 100) + ) + raw_results = await session.execute(query) + results = raw_results.all() + final_results = [] + for result in results: + final_results.append( + DuplicateInfo( + source_url=result.source_url, + duplicate_batch_id=result.duplicate_batch_id, + duplicate_metadata=result.duplicate_batch_parameters, + original_batch_id=result.original_batch_id, + original_metadata=result.original_batch_parameters, + original_url_id=result.original_url_id + ) + ) + return final_results + + @session_manager + async def get_recent_batch_status_info( + self, + session, + page: int, + collector_type: Optional[CollectorType] = None, + status: Optional[BatchStatus] = None, + ) -> List[BatchInfo]: + # Get only the batch_id, collector_type, status, and created_at + limit = 100 + query = (Select(Batch) + .order_by(Batch.date_generated.desc())) + if collector_type: + query = query.filter(Batch.strategy == collector_type.value) + if status: + query = query.filter(Batch.status == status.value) + query = (query.limit(limit) + .offset((page - 1) * limit)) + raw_results = await session.execute(query) + batches = raw_results.scalars().all() + return [BatchInfo(**batch.__dict__) for batch in batches] + + @session_manager + async def get_logs_by_batch_id(self, session, batch_id: int) -> List[LogOutputInfo]: + query = Select(Log).filter_by(batch_id=batch_id).order_by(Log.created_at.asc()) + raw_results = await session.execute(query) + logs = raw_results.scalars().all() + return ([LogOutputInfo(**log.__dict__) for log in logs]) + diff --git a/collector_db/DTOs/URLInfo.py b/collector_db/DTOs/URLInfo.py index afe6c2f2..c47d2830 100644 --- a/collector_db/DTOs/URLInfo.py +++ 
b/collector_db/DTOs/URLInfo.py
@@ -13,3 +13,4 @@ class URLInfo(BaseModel):
     collector_metadata: Optional[dict] = None
     outcome: URLStatus = URLStatus.PENDING
     updated_at: Optional[datetime.datetime] = None
+    name: Optional[str] = None
diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py
index 8d72ef0d..b8547f1d 100644
--- a/collector_db/DatabaseClient.py
+++ b/collector_db/DatabaseClient.py
@@ -16,13 +16,17 @@
 from collector_db.helper_functions import get_postgres_connection_string
 from collector_db.models import Base, Batch, URL, Log, Duplicate
 from collector_manager.enums import CollectorType
+from core.EnvVarManager import EnvVarManager
 from core.enums import BatchStatus


 # Database Client
 class DatabaseClient:
-    def __init__(self, db_url: str = get_postgres_connection_string()):
+    def __init__(self, db_url: Optional[str] = None):
         """Initialize the DatabaseClient."""
+        if db_url is None:
+            db_url = EnvVarManager.get().get_postgres_connection_string()
+
         self.engine = create_engine(
             url=db_url,
             echo=ConfigManager.get_sqlalchemy_echo(),
@@ -49,10 +53,6 @@ def wrapper(self, *args, **kwargs):

         return wrapper

-    def row_to_dict(self, row: Row) -> dict:
-        return dict(row._mapping)
-
-
     @session_manager
     def insert_batch(self, session, batch_info: BatchInfo) -> int:
         """Insert a new batch into the database and return its ID."""
@@ -105,24 +105,14 @@ def insert_url(self, session, url_info: URLInfo) -> int:
             batch_id=url_info.batch_id,
             url=url_info.url,
             collector_metadata=url_info.collector_metadata,
-            outcome=url_info.outcome.value
+            outcome=url_info.outcome.value,
+            name=url_info.name
         )
         session.add(url_entry)
         session.commit()
         session.refresh(url_entry)
         return url_entry.id

-    @session_manager
-    def add_duplicate_info(self, session, duplicate_infos: list[DuplicateInfo]):
-        # TODO: Add test for this method when testing CollectorDatabaseProcessor
-        for duplicate_info in duplicate_infos:
-            duplicate = Duplicate(
-                batch_id=duplicate_info.original_batch_id,
-                original_url_id=duplicate_info.original_url_id,
-            )
-            session.add(duplicate)
-
-
     def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo:
         url_mappings = []
         duplicates = []
@@ -163,83 +153,11 @@ def insert_logs(self, session, log_infos: List[LogInfo]):
                 log.created_at = log_info.created_at
                 session.add(log)

-    @session_manager
-    def get_logs_by_batch_id(self, session, batch_id: int) -> List[LogOutputInfo]:
-        logs = session.query(Log).filter_by(batch_id=batch_id).order_by(Log.created_at.asc()).all()
-        return ([LogOutputInfo(**log.__dict__) for log in logs])
-
-    @session_manager
-    def get_all_logs(self, session) -> List[LogInfo]:
-        logs = session.query(Log).all()
-        return ([LogInfo(**log.__dict__) for log in logs])
-
     @session_manager
     def get_batch_status(self, session, batch_id: int) -> BatchStatus:
         batch = session.query(Batch).filter_by(id=batch_id).first()
         return BatchStatus(batch.status)

-    @session_manager
-    def get_recent_batch_status_info(
-            self,
-            session,
-            page: int,
-            collector_type: Optional[CollectorType] = None,
-            status: Optional[BatchStatus] = None,
-    ) -> List[BatchInfo]:
-        # Get only the batch_id, collector_type, status, and created_at
-        limit = 100
-        query = (session.query(Batch)
-                 .order_by(Batch.date_generated.desc()))
-        if collector_type:
-            query = query.filter(Batch.strategy == collector_type.value)
-        if status:
-            query = query.filter(Batch.status == status.value)
-        query = (query.limit(limit)
-                 .offset((page - 1) * limit))
-        batches = query.all()
-        return 
[BatchInfo(**batch.__dict__) for batch in batches] - - @session_manager - def get_duplicates_by_batch_id(self, session, batch_id: int, page: int) -> List[DuplicateInfo]: - original_batch = aliased(Batch) - duplicate_batch = aliased(Batch) - - query = ( - session.query( - URL.url.label("source_url"), - URL.id.label("original_url_id"), - duplicate_batch.id.label("duplicate_batch_id"), - duplicate_batch.parameters.label("duplicate_batch_parameters"), - original_batch.id.label("original_batch_id"), - original_batch.parameters.label("original_batch_parameters"), - ) - .select_from(Duplicate) - .join(URL, Duplicate.original_url_id == URL.id) - .join(duplicate_batch, Duplicate.batch_id == duplicate_batch.id) - .join(original_batch, URL.batch_id == original_batch.id) - .filter(duplicate_batch.id == batch_id) - .limit(100) - .offset((page - 1) * 100) - ) - results = query.all() - final_results = [] - for result in results: - final_results.append( - DuplicateInfo( - source_url=result.source_url, - duplicate_batch_id=result.duplicate_batch_id, - duplicate_metadata=result.duplicate_batch_parameters, - original_batch_id=result.original_batch_id, - original_metadata=result.original_batch_parameters, - original_url_id=result.original_url_id - ) - ) - return final_results - - @session_manager - def delete_all_logs(self, session): - session.query(Log).delete() - @session_manager def delete_old_logs(self, session): """ diff --git a/collector_db/helper_functions.py b/collector_db/helper_functions.py index dcb161b9..4f99556a 100644 --- a/collector_db/helper_functions.py +++ b/collector_db/helper_functions.py @@ -2,15 +2,8 @@ import dotenv +from core.EnvVarManager import EnvVarManager + def get_postgres_connection_string(is_async = False): - dotenv.load_dotenv() - username = os.getenv("POSTGRES_USER") - password = os.getenv("POSTGRES_PASSWORD") - host = os.getenv("POSTGRES_HOST") - port = os.getenv("POSTGRES_PORT") - database = os.getenv("POSTGRES_DB") - driver = "postgresql" - if is_async: - driver += "+asyncpg" - return f"{driver}://{username}:{password}@{host}:{port}/{database}" \ No newline at end of file + return EnvVarManager.get().get_postgres_connection_string(is_async) diff --git a/collector_db/models.py b/collector_db/models.py index e0fd1b88..e98ef437 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -129,8 +129,8 @@ class URL(Base): "AutoRelevantSuggestion", uselist=False, back_populates="url") user_relevant_suggestions = relationship( "UserRelevantSuggestion", back_populates="url") - reviewing_users = relationship( - "ReviewingUserURL", back_populates="url") + reviewing_user = relationship( + "ReviewingUserURL", uselist=False, back_populates="url") optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( @@ -164,7 +164,7 @@ class ReviewingUserURL(Base): created_at = get_created_at_column() # Relationships - url = relationship("URL", back_populates="reviewing_users") + url = relationship("URL", uselist=False, back_populates="reviewing_user") class RootURL(Base): __tablename__ = 'root_url_cache' diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 85762c85..cb9a80bc 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -10,6 +10,9 @@ from collector_manager.enums import CollectorType from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo +from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse +from 
core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse +from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -17,11 +20,10 @@ from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo +from core.DTOs.MessageResponse import MessageResponse from core.TaskManager import TaskManager -from core.enums import BatchStatus, SuggestionType, RecordType +from core.enums import BatchStatus, RecordType -from pdap_api_client.AccessManager import AccessManager -from pdap_api_client.PDAPClient import PDAPClient from security_manager.SecurityManager import AccessInfo @@ -57,6 +59,27 @@ async def abort_batch(self, batch_id: int) -> MessageResponse: await self.collector_manager.abort_collector_async(cid=batch_id) return MessageResponse(message=f"Batch aborted.") + async def get_duplicate_urls_by_batch(self, batch_id: int, page: int = 1) -> GetDuplicatesByBatchResponse: + dup_infos = await self.adb_client.get_duplicates_by_batch_id(batch_id, page=page) + return GetDuplicatesByBatchResponse(duplicates=dup_infos) + + async def get_batch_statuses( + self, + collector_type: Optional[CollectorType], + status: Optional[BatchStatus], + page: int + ) -> GetBatchStatusResponse: + results = await self.adb_client.get_recent_batch_status_info( + collector_type=collector_type, + status=status, + page=page + ) + return GetBatchStatusResponse(results=results) + + async def get_batch_logs(self, batch_id: int) -> GetBatchLogsResponse: + logs = await self.adb_client.get_logs_by_batch_id(batch_id) + return GetBatchLogsResponse(logs=logs) + #endregion # region Collector diff --git a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py index 45fa7daf..c5b002d0 100644 --- a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py +++ b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py @@ -12,7 +12,14 @@ class SubmitApprovedURLTDO(BaseModel): agency_ids: list[int] name: str description: str + approving_user_id: int record_formats: Optional[list[str]] = None data_portal_type: Optional[str] = None supplying_entity: Optional[str] = None - data_source_id: Optional[int] = None \ No newline at end of file + data_source_id: Optional[int] = None + request_error: Optional[str] = None + +class SubmittedURLInfo(BaseModel): + url_id: int + data_source_id: Optional[int] + request_error: Optional[str] \ No newline at end of file diff --git a/core/EnvVarManager.py b/core/EnvVarManager.py new file mode 100644 index 00000000..39e4ce83 --- /dev/null +++ b/core/EnvVarManager.py @@ -0,0 +1,76 @@ +import os + +class EnvVarManager: + _instance = None + _allow_direct_init = False # internal flag + + """ + A class for unified management of environment variables + """ + def __new__(cls, *args, **kwargs): + if not cls._allow_direct_init: + raise RuntimeError("Use `EnvVarManager.get()` or `EnvVarManager.override()` instead.") + return super().__new__(cls) + + def __init__(self, env: dict = os.environ): + self.env = env + self._load() + + def _load(self): + + self.google_api_key = self.require_env("GOOGLE_API_KEY") + self.google_cse_id = 
self.require_env("GOOGLE_CSE_ID") + + self.pdap_email = self.require_env("PDAP_EMAIL") + self.pdap_password = self.require_env("PDAP_PASSWORD") + self.pdap_api_key = self.require_env("PDAP_API_KEY") + self.pdap_api_url = self.require_env("PDAP_API_URL") + + self.discord_webhook_url = self.require_env("DISCORD_WEBHOOK_URL") + + self.openai_api_key = self.require_env("OPENAI_API_KEY") + + self.postgres_user = self.require_env("POSTGRES_USER") + self.postgres_password = self.require_env("POSTGRES_PASSWORD") + self.postgres_host = self.require_env("POSTGRES_HOST") + self.postgres_port = self.require_env("POSTGRES_PORT") + self.postgres_db = self.require_env("POSTGRES_DB") + + @classmethod + def get(cls): + """ + Get the singleton instance, loading from environment if not yet + instantiated + """ + if cls._instance is None: + cls._allow_direct_init = True + cls._instance = cls(os.environ) + cls._allow_direct_init = False + return cls._instance + + @classmethod + def override(cls, env: dict): + """ + Create singleton instance that + overrides the environment variables with injected values + """ + cls._allow_direct_init = True + cls._instance = cls(env) + cls._allow_direct_init = False + + @classmethod + def reset(cls): + cls._instance = None + + def get_postgres_connection_string(self, is_async = False): + driver = "postgresql" + if is_async: + driver += "+asyncpg" + return (f"{driver}://{self.postgres_user}:{self.postgres_password}" + f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}") + + def require_env(self, key: str, allow_none: bool = False): + val = self.env.get(key) + if val is None and not allow_none: + raise ValueError(f"Environment variable {key} is not set") + return val \ No newline at end of file diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index 8002717c..4516ceb5 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -2,10 +2,6 @@ from collector_db.DatabaseClient import DatabaseClient -from collector_manager.enums import CollectorType -from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse -from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse -from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse from core.ScheduledTaskManager import ScheduledTaskManager from core.enums import BatchStatus @@ -15,38 +11,21 @@ def __init__( self, core_logger: Optional[Any] = None, # Deprecated collector_manager: Optional[Any] = None, # Deprecated - db_client: DatabaseClient = DatabaseClient(), + db_client: Optional[DatabaseClient] = None, dev_mode: bool = False ): + if db_client is None: + db_client = DatabaseClient() self.db_client = db_client if not dev_mode: self.scheduled_task_manager = ScheduledTaskManager(db_client=db_client) else: self.scheduled_task_manager = None - def get_duplicate_urls_by_batch(self, batch_id: int, page: int = 1) -> GetDuplicatesByBatchResponse: - dup_infos = self.db_client.get_duplicates_by_batch_id(batch_id, page=page) - return GetDuplicatesByBatchResponse(duplicates=dup_infos) - - def get_batch_statuses( - self, - collector_type: Optional[CollectorType], - status: Optional[BatchStatus], - page: int - ) -> GetBatchStatusResponse: - results = self.db_client.get_recent_batch_status_info( - collector_type=collector_type, - status=status, - page=page - ) - return GetBatchStatusResponse(results=results) def get_status(self, batch_id: int) -> BatchStatus: return self.db_client.get_batch_status(batch_id) - def get_batch_logs(self, batch_id: int) -> 
GetBatchLogsResponse: - logs = self.db_client.get_logs_by_batch_id(batch_id) - return GetBatchLogsResponse(logs=logs) def shutdown(self): if self.scheduled_task_manager is not None: diff --git a/core/TaskManager.py b/core/TaskManager.py index 624cb906..7796e80e 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -1,5 +1,4 @@ import logging -from typing import Optional from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient @@ -9,6 +8,7 @@ from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome from core.FunctionTrigger import FunctionTrigger from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator +from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator from core.classes.TaskOperatorBase import TaskOperatorBase from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator @@ -19,10 +19,8 @@ from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier -from pdap_api_client.AccessManager import AccessManager from pdap_api_client.PDAPClient import PDAPClient from util.DiscordNotifier import DiscordPoster -from util.helper_functions import get_from_env TASK_REPEAT_THRESHOLD = 20 @@ -35,12 +33,16 @@ def __init__( url_request_interface: URLRequestInterface, html_parser: HTMLResponseParser, discord_poster: DiscordPoster, + pdap_client: PDAPClient ): + # Dependencies self.adb_client = adb_client + self.pdap_client = pdap_client self.huggingface_interface = huggingface_interface self.url_request_interface = url_request_interface self.html_parser = html_parser self.discord_poster = discord_poster + self.logger = logging.getLogger(__name__) self.logger.addHandler(logging.StreamHandler()) self.logger.setLevel(logging.INFO) @@ -73,21 +75,21 @@ async def get_url_record_type_task_operator(self): return operator async def get_agency_identification_task_operator(self): - pdap_client = PDAPClient( - access_manager=AccessManager( - email=get_from_env("PDAP_EMAIL"), - password=get_from_env("PDAP_PASSWORD"), - api_key=get_from_env("PDAP_API_KEY"), - ), - ) muckrock_api_interface = MuckrockAPIInterface() operator = AgencyIdentificationTaskOperator( adb_client=self.adb_client, - pdap_client=pdap_client, + pdap_client=self.pdap_client, muckrock_api_interface=muckrock_api_interface ) return operator + async def get_submit_approved_url_task_operator(self): + operator = SubmitApprovedURLTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return operator + async def get_url_miscellaneous_metadata_task_operator(self): operator = URLMiscellaneousMetadataTaskOperator( adb_client=self.adb_client @@ -96,11 +98,12 @@ async def get_url_miscellaneous_metadata_task_operator(self): async def get_task_operators(self) -> list[TaskOperatorBase]: return [ - # await self.get_url_html_task_operator(), + await self.get_url_html_task_operator(), await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), await self.get_agency_identification_task_operator(), - await self.get_url_miscellaneous_metadata_task_operator() + await self.get_url_miscellaneous_metadata_task_operator(), + await self.get_submit_approved_url_task_operator() ] 
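
With this refactor, TaskManager no longer constructs its own PDAPClient; the caller injects one. A minimal wiring sketch, assuming only the constructor keywords shown in this series -- the AsyncMock stand-ins mirror tests/test_automated/integration/core/test_async_core.py, and a real deployment would pass the concrete HuggingFaceInterface, URLRequestInterface, HTMLResponseParser, and DiscordPoster instances instead:

    from unittest.mock import AsyncMock

    from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
    from core.EnvVarManager import EnvVarManager
    from core.TaskManager import TaskManager
    from pdap_api_client.AccessManager import AccessManager
    from pdap_api_client.PDAPClient import PDAPClient

    env = EnvVarManager.get()
    task_manager = TaskManager(
        adb_client=AsyncDatabaseClient(),
        huggingface_interface=AsyncMock(),  # stand-in for HuggingFaceInterface
        url_request_interface=AsyncMock(),  # stand-in for URLRequestInterface
        html_parser=AsyncMock(),            # stand-in for HTMLResponseParser
        discord_poster=AsyncMock(),         # stand-in for DiscordPoster
        pdap_client=PDAPClient(
            access_manager=AccessManager(
                email=env.pdap_email,
                password=env.pdap_password,
                api_key=env.pdap_api_key,
            ),
        ),
    )
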
#endregion
diff --git a/core/classes/SubmitApprovedURLTaskOperator.py b/core/classes/SubmitApprovedURLTaskOperator.py
index 2a308e7c..81f0b242 100644
--- a/core/classes/SubmitApprovedURLTaskOperator.py
+++ b/core/classes/SubmitApprovedURLTaskOperator.py
@@ -31,23 +31,35 @@ async def inner_task_logic(self):
         await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos])

         # Submit each URL, recording errors if they exist
-        error_infos: list[URLErrorPydanticInfo] = []
-        success_tdos: list[SubmitApprovedURLTDO] = []
-        for tdo in tdos:
-            try:
-                data_source_id = await self.pdap_client.submit_url(tdo)
-                tdo.data_source_id = data_source_id
-                success_tdos.append(tdo)
-            except Exception as e:
-                error_info = URLErrorPydanticInfo(
-                    task_id=self.task_id,
-                    url_id=tdo.url_id,
-                    error=str(e),
-                )
-                error_infos.append(error_info)
+        submitted_url_infos = await self.pdap_client.submit_urls(tdos)
+
+        error_infos = await self.get_error_infos(submitted_url_infos)
+        success_infos = await self.get_success_infos(submitted_url_infos)

         # Update the database for successful submissions
-        await self.adb_client.mark_urls_as_submitted(tdos=success_tdos)
+        await self.adb_client.mark_urls_as_submitted(infos=success_infos)

         # Update the database for failed submissions
         await self.adb_client.add_url_error_infos(error_infos)
+
+    async def get_success_infos(self, submitted_url_infos):
+        success_infos = [
+            response_object for response_object in submitted_url_infos
+            if response_object.data_source_id is not None
+        ]
+        return success_infos
+
+    async def get_error_infos(self, submitted_url_infos):
+        error_infos: list[URLErrorPydanticInfo] = []
+        error_response_objects = [
+            response_object for response_object in submitted_url_infos
+            if response_object.request_error is not None
+        ]
+        for error_response_object in error_response_objects:
+            error_info = URLErrorPydanticInfo(
+                task_id=self.task_id,
+                url_id=error_response_object.url_id,
+                error=error_response_object.request_error,
+            )
+            error_infos.append(error_info)
+        return error_infos
diff --git a/core/enums.py b/core/enums.py
index 213db47c..cfccbb92 100644
--- a/core/enums.py
+++ b/core/enums.py
@@ -7,11 +7,10 @@ class BatchStatus(Enum):
     ERROR = "error"
     ABORTED = "aborted"

-class LabelStudioTaskStatus(Enum):
-    PENDING = "pending"
-    COMPLETED = "completed"
-
 class RecordType(Enum):
+    """
+    All available URL record types
+    """
     ACCIDENT_REPORTS = "Accident Reports"
     ARREST_RECORDS = "Arrest Records"
     CALLS_FOR_SERVICE = "Calls for Service"
@@ -51,8 +50,19 @@ class RecordType(Enum):


 class SuggestionType(Enum):
+    """
+    Identifies the specific kind of suggestion made for a URL
+    """
     AUTO_SUGGESTION = "Auto Suggestion"
     MANUAL_SUGGESTION = "Manual Suggestion"
     UNKNOWN = "Unknown"
     NEW_AGENCY = "New Agency"
     CONFIRMED = "Confirmed"
+
+class SubmitResponseStatus(Enum):
+    """
+    Response statuses from the /source-collector/data-sources endpoint
+    """
+    SUCCESS = "success"
+    FAILURE = "failure"
+    ALREADY_EXISTS = "already_exists"
\ No newline at end of file
diff --git a/html_tag_collector/RootURLCache.py b/html_tag_collector/RootURLCache.py
index e306b6e1..165be89d 100644
--- a/html_tag_collector/RootURLCache.py
+++ b/html_tag_collector/RootURLCache.py
@@ -16,7 +16,9 @@ class RootURLCacheResponseInfo:
     exception: Optional[Exception] = None

 class RootURLCache:
-    def __init__(self, adb_client: AsyncDatabaseClient = AsyncDatabaseClient()):
+    def __init__(self, adb_client: Optional[AsyncDatabaseClient] = None):
+        if adb_client is None:
+            adb_client = AsyncDatabaseClient()
         self.adb_client = 
adb_client
         self.cache = None

diff --git a/llm_api_logic/OpenAIRecordClassifier.py b/llm_api_logic/OpenAIRecordClassifier.py
index fc20a0e2..cc0829b5 100644
--- a/llm_api_logic/OpenAIRecordClassifier.py
+++ b/llm_api_logic/OpenAIRecordClassifier.py
@@ -1,17 +1,16 @@
-from typing import Any

 from openai.types.chat import ParsedChatCompletion

+from core.EnvVarManager import EnvVarManager
 from llm_api_logic.LLMRecordClassifierBase import RecordClassifierBase
 from llm_api_logic.RecordTypeStructuredOutput import RecordTypeStructuredOutput
-from util.helper_functions import get_from_env


 class OpenAIRecordClassifier(RecordClassifierBase):

     @property
     def api_key(self):
-        return get_from_env("OPENAI_API_KEY")
+        return EnvVarManager.get().openai_api_key

     @property
     def model_name(self):
diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py
index 1020f365..aadd8451 100644
--- a/pdap_api_client/AccessManager.py
+++ b/pdap_api_client/AccessManager.py
@@ -4,8 +4,8 @@
 import requests
 from aiohttp import ClientSession

+from core.EnvVarManager import EnvVarManager
 from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo
-from util.helper_functions import get_from_env

 request_methods = {
     RequestType.POST: ClientSession.post,
@@ -23,7 +23,7 @@ def build_url(
         namespace: Namespaces,
         subdomains: Optional[list[str]] = None
 ):
-    api_url = get_from_env('PDAP_API_URL')
+    api_url = EnvVarManager.get().pdap_api_url
     url = f"{api_url}/{namespace.value}"
     if subdomains is not None:
         url = f"{url}/{'/'.join(subdomains)}"
diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py
index 37d7e857..93f67839 100644
--- a/pdap_api_client/DTOs.py
+++ b/pdap_api_client/DTOs.py
@@ -37,6 +37,7 @@ class Namespaces(Enum):
     MATCH = "match"
     CHECK = "check"
     DATA_SOURCES = "data-sources"
+    SOURCE_COLLECTOR = "source-collector"


 class RequestType(Enum):
diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py
index 8b1c5e82..24b9d98c 100644
--- a/pdap_api_client/PDAPClient.py
+++ b/pdap_api_client/PDAPClient.py
@@ -1,6 +1,6 @@
 from typing import Optional

-from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO
+from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
 from pdap_api_client.AccessManager import build_url, AccessManager
 from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \
     RequestType, RequestInfo, MatchAgencyResponse
@@ -85,30 +85,59 @@ async def is_url_unique(
             duplicates=duplicates
         )

-    async def submit_url(
+    async def submit_urls(
         self,
-        tdo: SubmitApprovedURLTDO
-    ) -> int:
-        url = build_url(
-            namespace=Namespaces.DATA_SOURCES,
+        tdos: list[SubmitApprovedURLTDO]
+    ) -> list[SubmittedURLInfo]:
+        """
+        Submits URLs to the Data Sources App, returning one
+        SubmittedURLInfo per URL with its data source id or error
+        """
+        request_url = build_url(
+            namespace=Namespaces.SOURCE_COLLECTOR,
+            subdomains=["data-sources"]
         )
+
+        # Build url-id dictionary
+        url_id_dict = {}
+        for tdo in tdos:
+            url_id_dict[tdo.url] = tdo.url_id
+
+        data_sources_json = []
+        for tdo in tdos:
+            data_sources_json.append({
+                "name": tdo.name,
+                "description": tdo.description,
+                "source_url": tdo.url,
+                "record_type": tdo.record_type.value,
+                "record_formats": tdo.record_formats,
+                "data_portal_type": tdo.data_portal_type,
+                "last_approval_editor": tdo.approving_user_id,
+                "supplying_entity": tdo.supplying_entity,
+                "agency_ids": tdo.agency_ids
+            })
+
+        headers = await 
self.access_manager.jwt_header() request_info = RequestInfo( type_=RequestType.POST, - url=url, + url=request_url, headers=headers, json={ - "entry_data": { - "name": tdo.name, - "description": tdo.description, - "source_url": tdo.url, - "record_type_name": tdo.record_type.value, - "record_formats": tdo.record_formats, - "data_portal_type": tdo.data_portal_type, - "supplying_entity": tdo.supplying_entity - }, - "linked_agency_ids": tdo.agency_ids + "data_sources": data_sources_json } ) response_info = await self.access_manager.make_request(request_info) - return response_info.data["id"] + data_sources_response_json = response_info.data["data_sources"] + + results = [] + for data_source in data_sources_response_json: + url = data_source["url"] + response_object = SubmittedURLInfo( + url_id=url_id_dict[url], + data_source_id=data_source["data_source_id"], + request_error=data_source["error"] + ) + results.append(response_object) + + return results diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py index 48859adc..940c372e 100644 --- a/start_mirrored_local_app.py +++ b/start_mirrored_local_app.py @@ -19,7 +19,7 @@ import docker from docker.errors import APIError, NotFound from docker.models.containers import Container -from pydantic import BaseModel, model_validator, AfterValidator +from pydantic import BaseModel, AfterValidator from apply_migrations import apply_migrations from util.helper_functions import get_from_env @@ -193,7 +193,7 @@ def get_image(self, dockerfile_info: DockerfileInfo): def run_container( self, docker_info: DockerInfo, - ): + ) -> Container: print(f"Running container {docker_info.name}") try: container = self.client.containers.get(docker_info.name) @@ -255,24 +255,11 @@ def set_last_run_time(self): with open("local_state/last_run.txt", "w") as f: f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - - -def main(): - docker_manager = DockerManager() - # Ensure docker is running, and start if not - if not is_docker_running(): - start_docker_engine() - - - # Ensure Dockerfile for database is running, and if not, start it - database_docker_info = DockerInfo( +def get_database_docker_info() -> DockerInfo: + return DockerInfo( dockerfile_info=DockerfileInfo( image_tag="postgres:15", ), - # volume_info=VolumeInfo( - # host_path="dbscripts", - # container_path="/var/lib/postgresql/data" - # ), name="data_source_identification_db", ports={ "5432/tcp": 5432 @@ -290,12 +277,9 @@ def main(): start_period=2 ) ) - container = docker_manager.run_container(database_docker_info) - wait_for_pg_to_be_ready(container) - - # Start dockerfile for Datadumper - data_dumper_docker_info = DockerInfo( +def get_data_dumper_docker_info() -> DockerInfo: + return DockerInfo( dockerfile_info=DockerfileInfo( image_tag="datadumper", dockerfile_directory="local_database/DataDumper" @@ -320,6 +304,21 @@ def main(): command="bash" ) +def main(): + docker_manager = DockerManager() + # Ensure docker is running, and start if not + if not is_docker_running(): + start_docker_engine() + + # Ensure Dockerfile for database is running, and if not, start it + database_docker_info = get_database_docker_info() + container = docker_manager.run_container(database_docker_info) + wait_for_pg_to_be_ready(container) + + + # Start dockerfile for Datadumper + data_dumper_docker_info = get_data_dumper_docker_info() + # If not last run within 24 hours, run dump operation in Datadumper # Check cache if exists and checker = TimestampChecker() @@ -343,11 +342,20 @@ def main(): apply_migrations() # 
Run `fastapi dev main.py` - uvicorn.run( - "api.main:app", - host="0.0.0.0", - port=8000 - ) + try: + uvicorn.run( + "api.main:app", + host="0.0.0.0", + port=8000 + ) + finally: + # Add feature to stop all running containers + print("Stopping containers...") + for container in docker_manager.client.containers.list(): + container.stop() + + print("Containers stopped.") + diff --git a/tests/conftest.py b/tests/conftest.py index 8aeb6dc6..d7b1bce7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,4 @@ import pytest -from alembic import command from alembic.config import Config from sqlalchemy import create_engine, inspect, MetaData from sqlalchemy.orm import scoped_session, sessionmaker @@ -8,12 +7,42 @@ from collector_db.DatabaseClient import DatabaseClient from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import Base +from core.EnvVarManager import EnvVarManager from tests.helpers.AlembicRunner import AlembicRunner from tests.helpers.DBDataCreator import DBDataCreator +from util.helper_functions import load_from_environment @pytest.fixture(autouse=True, scope="session") def setup_and_teardown(): + # Set up environment variables that must be defined + # outside of tests + required_env_vars: dict = load_from_environment( + keys=[ + "POSTGRES_USER", + "POSTGRES_PASSWORD", + "POSTGRES_HOST", + "POSTGRES_PORT", + "POSTGRES_DB", + ] + ) + # Add test environment variables + test_env_vars = [ + "GOOGLE_API_KEY", + "GOOGLE_CSE_ID", + "PDAP_EMAIL", + "PDAP_PASSWORD", + "PDAP_API_KEY", + "PDAP_API_URL", + "DISCORD_WEBHOOK_URL", + "OPENAI_API_KEY", + ] + all_env_vars = required_env_vars.copy() + for env_var in test_env_vars: + all_env_vars[env_var] = "TEST" + + EnvVarManager.override(all_env_vars) + conn = get_postgres_connection_string() engine = create_engine(conn) alembic_cfg = Config("alembic.ini") diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 60b873c4..613bfe4d 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -23,13 +23,17 @@ class BatchURLCreationInfo(BaseModel): batch_id: int url_ids: list[int] + urls: list[str] class DBDataCreator: """ Assists in the creation of test data """ - def __init__(self, db_client: DatabaseClient = DatabaseClient()): - self.db_client = db_client + def __init__(self, db_client: Optional[DatabaseClient] = None): + if db_client is not None: + self.db_client = db_client + else: + self.db_client = DatabaseClient() self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() def batch(self, strategy: CollectorType = CollectorType.EXAMPLE) -> int: @@ -63,7 +67,8 @@ async def batch_and_urls( return BatchURLCreationInfo( batch_id=batch_id, - url_ids=url_ids + url_ids=url_ids, + urls=[iui.url for iui in iuis.url_mappings] ) async def agency(self) -> int: @@ -189,6 +194,7 @@ def urls( URLInfo( url=url, outcome=outcome, + name="Test Name" if outcome == URLStatus.VALIDATED else None, collector_metadata=collector_metadata ) ) diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index b466bfbb..ae34b28e 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -1,9 +1,6 @@ -import asyncio -import logging -import os from dataclasses import dataclass from typing import Generator -from unittest.mock import MagicMock, AsyncMock +from unittest.mock import MagicMock, AsyncMock, patch import pytest import pytest_asyncio @@ -11,7 +8,6 @@ 
from api.main import app from core.AsyncCore import AsyncCore -from core.AsyncCoreLogger import AsyncCoreLogger from core.SourceCollectorCore import SourceCollectorCore from security_manager.SecurityManager import get_access_info, AccessInfo, Permissions from tests.helpers.DBDataCreator import DBDataCreator @@ -48,9 +44,7 @@ def override_access_info() -> AccessInfo: @pytest.fixture(scope="session") def client() -> Generator[TestClient, None, None]: - # Mock envioronment - _original_env = dict(os.environ) - os.environ["DISCORD_WEBHOOK_URL"] = "https://discord.com" + # Mock environment with TestClient(app) as c: app.dependency_overrides[get_access_info] = override_access_info async_core: AsyncCore = c.app.state.async_core @@ -67,8 +61,6 @@ def client() -> Generator[TestClient, None, None]: yield c # Reset environment variables back to original state - os.environ.clear() - os.environ.update(_original_env) @pytest_asyncio.fixture diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 0e462ba5..0501ac1f 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -1,15 +1,12 @@ -from typing import Any import pytest from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.URLMapping import URLMapping -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource from collector_db.models import UserUrlAgencySuggestion, UserRelevantSuggestion, UserRecordTypeSuggestion from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo -from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import RecordType, SuggestionType diff --git a/tests/test_automated/integration/api/test_duplicates.py b/tests/test_automated/integration/api/test_duplicates.py index c42b894d..a5c77b29 100644 --- a/tests/test_automated/integration/api/test_duplicates.py +++ b/tests/test_automated/integration/api/test_duplicates.py @@ -1,5 +1,4 @@ import time -from unittest.mock import AsyncMock from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO @@ -30,7 +29,7 @@ def test_duplicates(api_test_helper): assert batch_id_2 is not None - time.sleep(2) + time.sleep(1.5) bi_1: BatchInfo = ath.request_validator.get_batch_info(batch_id_1) bi_2: BatchInfo = ath.request_validator.get_batch_info(batch_id_2) diff --git a/tests/test_automated/integration/collector_db/test_database_structure.py b/tests/test_automated/integration/collector_db/test_database_structure.py index 2b2fcbca..6d82631c 100644 --- a/tests/test_automated/integration/collector_db/test_database_structure.py +++ b/tests/test_automated/integration/collector_db/test_database_structure.py @@ -52,9 +52,11 @@ def __init__( self, columns: list[ColumnTester], table_name: str, - engine: sa.Engine = create_engine(get_postgres_connection_string()), + engine: Optional[sa.Engine] = None, constraints: Optional[list[ConstraintTester]] = None, ): + if engine is None: + engine = 
create_engine(get_postgres_connection_string())
         self.columns = columns
         self.table_name = table_name
         self.constraints = constraints
@@ -228,6 +230,11 @@ def test_url(db_data_creator: DBDataCreator):
                 column_name="outcome",
                 type_=postgresql.ENUM,
                 allowed_values=get_enum_values(URLStatus)
+            ),
+            ColumnTester(
+                column_name="name",
+                type_=sa.String,
+                allowed_values=['test'],
             )
         ],
         engine=db_data_creator.db_client.engine
diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py
index c78bf57e..7b98728f 100644
--- a/tests/test_automated/integration/collector_db/test_db_client.py
+++ b/tests/test_automated/integration/collector_db/test_db_client.py
@@ -60,11 +60,12 @@ async def test_insert_urls(
     assert insert_urls_info.original_count == 2
     assert insert_urls_info.duplicate_count == 1

-
-def test_insert_logs(db_data_creator: DBDataCreator):
+@pytest.mark.asyncio
+async def test_insert_logs(db_data_creator: DBDataCreator):
     batch_id_1 = db_data_creator.batch()
     batch_id_2 = db_data_creator.batch()

+    adb_client = db_data_creator.adb_client
     db_client = db_data_creator.db_client
     db_client.insert_logs(
         log_infos=[
@@ -74,26 +75,28 @@ def test_insert_logs(db_data_creator: DBDataCreator):
         ]
     )

-    logs = db_client.get_logs_by_batch_id(batch_id_1)
+    logs = await adb_client.get_logs_by_batch_id(batch_id_1)
     assert len(logs) == 2

-    logs = db_client.get_logs_by_batch_id(batch_id_2)
+    logs = await adb_client.get_logs_by_batch_id(batch_id_2)
     assert len(logs) == 1

-def test_delete_old_logs(db_data_creator: DBDataCreator):
+@pytest.mark.asyncio
+async def test_delete_old_logs(db_data_creator: DBDataCreator):
     batch_id = db_data_creator.batch()
     old_datetime = datetime.now() - timedelta(days=1)
     db_client = db_data_creator.db_client
+    adb_client = db_data_creator.adb_client
     log_infos = []
     for i in range(3):
         log_infos.append(LogInfo(log="test log", batch_id=batch_id, created_at=old_datetime))
     db_client.insert_logs(log_infos=log_infos)

-    logs = db_client.get_logs_by_batch_id(batch_id=batch_id)
+    logs = await adb_client.get_logs_by_batch_id(batch_id=batch_id)
     assert len(logs) == 3

     db_client.delete_old_logs()

-    logs = db_client.get_logs_by_batch_id(batch_id=batch_id)
+    logs = await adb_client.get_logs_by_batch_id(batch_id=batch_id)
     assert len(logs) == 0

 def test_delete_url_updated_at(db_data_creator: DBDataCreator):
diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py
index b4b8e740..ed314dfd 100644
--- a/tests/test_automated/integration/core/test_async_core.py
+++ b/tests/test_automated/integration/core/test_async_core.py
@@ -21,6 +21,7 @@ def setup_async_core(adb_client: AsyncDatabaseClient):
             url_request_interface=AsyncMock(),
             html_parser=AsyncMock(),
             discord_poster=AsyncMock(),
+            pdap_client=AsyncMock()
         ),
         collector_manager=AsyncMock()
     )
diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py
index 75630af8..04256de9 100644
--- a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py
+++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py
@@ -3,39 +3,54 @@

 import pytest

-from collector_db.models import URL
+from collector_db.enums import TaskType
+from collector_db.models import URL, URLErrorInfo
 from collector_manager.enums import URLStatus
 from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator -from core.enums import RecordType +from core.enums import RecordType, SubmitResponseStatus from helpers.DBDataCreator import BatchURLCreationInfo, DBDataCreator from pdap_api_client.AccessManager import AccessManager from pdap_api_client.DTOs import RequestInfo, RequestType, ResponseInfo from pdap_api_client.PDAPClient import PDAPClient +def mock_make_request(pdap_client: PDAPClient, urls: list[str]): + assert len(urls) == 3, "Expected 3 urls" + pdap_client.access_manager.make_request = AsyncMock( + return_value=ResponseInfo( + status_code=HTTPStatus.OK, + data={ + "data_sources": [ + { + "url": urls[0], + "status": SubmitResponseStatus.SUCCESS, + "error": None, + "data_source_id": 21, + }, + { + "url": urls[1], + "status": SubmitResponseStatus.SUCCESS, + "error": None, + "data_source_id": 34, + }, + { + "url": urls[2], + "status": SubmitResponseStatus.FAILURE, + "error": "Test Error", + "data_source_id": None + } + ] + } + ) + ) + @pytest.fixture -def mock_pdap_client(): +def mock_pdap_client() -> PDAPClient: mock_access_manager = MagicMock( spec=AccessManager ) - mock_access_manager.make_request = AsyncMock( - side_effect=[ - ResponseInfo( - status_code=HTTPStatus.OK, - data={ - "id": 21 - } - ), - ResponseInfo( - status_code=HTTPStatus.OK, - data={ - "id": 34 - } - ) - ] - ) mock_access_manager.jwt_header = AsyncMock( return_value={"Authorization": "Bearer token"} ) @@ -44,13 +59,15 @@ def mock_pdap_client(): ) return pdap_client -async def setup_validated_urls(db_data_creator: DBDataCreator): +async def setup_validated_urls(db_data_creator: DBDataCreator) -> list[str]: creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls( - url_count=2, + url_count=3, with_html_content=True ) + url_1 = creation_info.url_ids[0] url_2 = creation_info.url_ids[1] + url_3 = creation_info.url_ids[2] await db_data_creator.adb_client.approve_url( approval_info=FinalReviewApprovalInfo( url_id=url_1, @@ -72,16 +89,31 @@ async def setup_validated_urls(db_data_creator: DBDataCreator): name="URL 2 Name", description="URL 2 Description", ), - user_id=1 + user_id=2 + ) + await db_data_creator.adb_client.approve_url( + approval_info=FinalReviewApprovalInfo( + url_id=url_3, + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=[5, 6], + name="URL 3 Name", + description="URL 3 Description", + ), + user_id=3 ) + return creation_info.urls @pytest.mark.asyncio async def test_submit_approved_url_task( db_data_creator, - mock_pdap_client, + mock_pdap_client: PDAPClient, monkeypatch ): - monkeypatch.setenv("PDAP_API_URL", "http://localhost:8000") + """ + The submit_approved_url_task should submit + all validated URLs to the PDAP Data Sources App + """ + # Get Task Operator operator = SubmitApprovedURLTaskOperator( @@ -94,13 +126,17 @@ async def test_submit_approved_url_task( # Create URLs with status 'validated' in database and all requisite URL values # Ensure they have optional metadata as well - await setup_validated_urls(db_data_creator) + urls = await setup_validated_urls(db_data_creator) + mock_make_request(mock_pdap_client, urls) # Check Task Operator does meet pre-requisites assert await operator.meets_task_prerequisites() # Run Task - run_info = await operator.run_task(task_id=1) + task_id = await db_data_creator.adb_client.initiate_task( + task_type=TaskType.SUBMIT_APPROVED + ) + run_info = await operator.run_task(task_id=task_id) # Check Task has 
been marked as completed
     assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message

@@ -109,63 +145,73 @@
     urls = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id")
     url_1 = urls[0]
     url_2 = urls[1]
+    url_3 = urls[2]

     # Check URLs have been marked as 'submitted'
     assert url_1.outcome == URLStatus.SUBMITTED.value
     assert url_2.outcome == URLStatus.SUBMITTED.value
+    assert url_3.outcome == URLStatus.ERROR.value

     # Check URLs now have data source ids
     assert url_1.data_source_id == 21
     assert url_2.data_source_id == 34
+    assert url_3.data_source_id is None

-    # Check mock method was called twice with expected parameters
-    access_manager = mock_pdap_client.access_manager
-    assert access_manager.make_request.call_count == 2
-    # Check first call
+    # Check that errored URL has entry in url_error_info
+    url_errors = await db_data_creator.adb_client.get_all(URLErrorInfo)
+    assert len(url_errors) == 1
+    url_error = url_errors[0]
+    assert url_error.url_id == url_3.id
+    assert url_error.error == "Test Error"

+    # Check mock method was called with expected parameters
+    access_manager = mock_pdap_client.access_manager
+    access_manager.make_request.assert_called_once()
     call_1 = access_manager.make_request.call_args_list[0][0][0]
     expected_call_1 = RequestInfo(
         type_=RequestType.POST,
-        url="http://localhost:8000/data-sources",
+        url="TEST/source-collector/data-sources",
         headers=access_manager.jwt_header.return_value,
         json={
-            "entry_data": {
-                "name": "URL 1 Name",
-                "source_url": url_1.url,
-                "record_type_name": "Accident Reports",
-                "description": "URL 1 Description",
-                "record_formats": ["Record Format 1", "Record Format 2"],
-                "data_portal_type": "Data Portal Type 1",
-                "supplying_entity": "Supplying Entity 1"
-            },
-            "linked_agency_ids": [1, 2]
+            "data_sources": [
+                {
+                    "name": "URL 1 Name",
+                    "source_url": url_1.url,
+                    "record_type": "Accident Reports",
+                    "description": "URL 1 Description",
+                    "record_formats": ["Record Format 1", "Record Format 2"],
+                    "data_portal_type": "Data Portal Type 1",
+                    "last_approval_editor": 1,
+                    "supplying_entity": "Supplying Entity 1",
+                    "agency_ids": [1, 2]
+                },
+                {
+                    "name": "URL 2 Name",
+                    "source_url": url_2.url,
+                    "record_type": "Incarceration Records",
+                    "description": "URL 2 Description",
+                    "last_approval_editor": 2,
+                    "supplying_entity": None,
+                    "record_formats": None,
+                    "data_portal_type": None,
+                    "agency_ids": [3, 4]
+                },
+                {
+                    "name": "URL 3 Name",
+                    "source_url": url_3.url,
+                    "record_type": "Accident Reports",
+                    "description": "URL 3 Description",
+                    "last_approval_editor": 3,
+                    "supplying_entity": None,
+                    "record_formats": None,
+                    "data_portal_type": None,
+                    "agency_ids": [5, 6]
+                }
+            ]
         }
     )
     assert call_1.type_ == expected_call_1.type_
     assert call_1.url == expected_call_1.url
     assert call_1.headers == expected_call_1.headers
     assert call_1.json == expected_call_1.json
-    # Check second call
-    call_2 = access_manager.make_request.call_args_list[1][0][0]
-    expected_call_2 = RequestInfo(
-        type_=RequestType.POST,
-        url="http://localhost:8000/data-sources",
-        headers=access_manager.jwt_header.return_value,
-        json={
-            "entry_data": {
-                "name": "URL 2 Name",
-                "source_url": url_2.url,
-                "record_type_name": "Incarceration Records",
-                "description": "URL 2 Description",
-                "data_portal_type": None,
-                "supplying_entity": None,
-                "record_formats": None
-            },
-            "linked_agency_ids": [3, 4]
-        }
-    )
-    assert call_2.type_ == expected_call_2.type_
-    assert call_2.url == expected_call_2.url
-    assert call_2.headers == 
expected_call_2.headers - assert call_2.json == expected_call_2.json \ No newline at end of file diff --git a/util/DiscordNotifier.py b/util/DiscordNotifier.py index 15e74020..6df1aa90 100644 --- a/util/DiscordNotifier.py +++ b/util/DiscordNotifier.py @@ -10,4 +10,10 @@ def __init__(self, webhook_url: str): raise ValueError("WEBHOOK_URL environment variable not set") self.webhook_url = webhook_url def post_to_discord(self, message): - requests.post(self.webhook_url, json={"content": message}) + try: + requests.post(self.webhook_url, json={"content": message}) + except Exception as e: + logging.error( + f"Error posting message to Discord: {e}." + f"\n\nMessage: {message}" + ) diff --git a/util/helper_functions.py b/util/helper_functions.py index bf72d39b..7d6c7f8d 100644 --- a/util/helper_functions.py +++ b/util/helper_functions.py @@ -16,6 +16,19 @@ def get_from_env(key: str, allow_none: bool = False): raise ValueError(f"Environment variable {key} is not set") return val +def load_from_environment(keys: list[str]) -> dict[str, str]: + """ + Load selected keys from environment, returning a dictionary + """ + original_environment = os.environ.copy() + try: + load_dotenv() + return {key: os.getenv(key) for key in keys} + finally: + # Restore the original environment + os.environ.clear() + os.environ.update(original_environment) + def base_model_list_dump(model_list: list[BaseModel]) -> list[dict]: return [model.model_dump() for model in model_list] From beae1b9885b0e2e94dfefad703bf4c02fb9fce3b Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 15 Apr 2025 16:53:48 -0400 Subject: [PATCH 109/182] fix(tests): fix import bug --- .../integration/tasks/test_submit_approved_url_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py index 04256de9..b15ff9d5 100644 --- a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py +++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py @@ -10,7 +10,7 @@ from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator from core.enums import RecordType, SubmitResponseStatus -from helpers.DBDataCreator import BatchURLCreationInfo, DBDataCreator +from tests.helpers.DBDataCreator import BatchURLCreationInfo, DBDataCreator from pdap_api_client.AccessManager import AccessManager from pdap_api_client.DTOs import RequestInfo, RequestType, ResponseInfo from pdap_api_client.PDAPClient import PDAPClient From d134194cefcd9616677ca3199c0527167c728e39 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 16 Apr 2025 20:10:24 -0400 Subject: [PATCH 110/182] feat(database): allow one user annotation per url Previously, multiple users could annotate a single URL. Now, only one user can annotate a URL -- after that URL has been annotated, no other users can annotate it. 
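
A hedged sketch of what the new constraints enforce, assuming an async SQLAlchemy session and the UserRelevantSuggestion fields (url_id, user_id, relevant) referenced elsewhere in this series; the helper function itself is illustrative, not part of the patch:

    from sqlalchemy.exc import IntegrityError
    from sqlalchemy.ext.asyncio import AsyncSession

    from collector_db.models import UserRelevantSuggestion

    async def annotate_relevance(session: AsyncSession, url_id: int) -> None:
        # The first relevance annotation for a URL is accepted.
        session.add(UserRelevantSuggestion(url_id=url_id, user_id=1, relevant=True))
        await session.commit()

        # A second annotation of the same URL, by any user, now violates the
        # uq_user_relevant_suggestions_url_id constraint added below.
        session.add(UserRelevantSuggestion(url_id=url_id, user_id=2, relevant=False))
        try:
            await session.commit()
        except IntegrityError:
            await session.rollback()  # URL already annotated; the second write is rejected
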
--- ...72_set_user_annotation_tables_to_allow_.py | 61 ++++++ collector_db/AsyncDatabaseClient.py | 9 +- core/DTOs/GetNextURLForFinalReviewResponse.py | 2 + core/TaskManager.py | 14 +- .../AgencyIdentificationTaskOperator.py | 2 +- .../SubmitApprovedURLTaskOperator.py | 2 +- .../{ => task_operators}/TaskOperatorBase.py | 0 .../URLHTMLTaskOperator.py | 2 +- .../URLMiscellaneousMetadataTaskOperator.py | 2 +- .../URLRecordTypeTaskOperator.py | 2 +- .../URLRelevanceHuggingfaceTaskOperator.py | 5 +- core/classes/task_operators/__init__.py | 0 .../test_html_tag_collector_integration.py | 2 +- .../integration/api/test_annotate.py | 20 +- .../collector_db/test_db_client.py | 175 ++++++++++++++++-- .../tasks/test_agency_preannotation_task.py | 3 +- .../integration/tasks/test_example_task.py | 3 +- .../tasks/test_submit_approved_url_task.py | 2 +- .../integration/tasks/test_url_html_task.py | 2 +- .../test_url_miscellaneous_metadata_task.py | 2 +- .../tasks/test_url_record_type_task.py | 4 +- .../test_url_relevancy_huggingface_task.py | 3 +- 22 files changed, 266 insertions(+), 51 deletions(-) create mode 100644 alembic/versions/2025_04_16_1954-997f5bf53772_set_user_annotation_tables_to_allow_.py rename core/classes/{ => task_operators}/AgencyIdentificationTaskOperator.py (98%) rename core/classes/{ => task_operators}/SubmitApprovedURLTaskOperator.py (97%) rename core/classes/{ => task_operators}/TaskOperatorBase.py (100%) rename core/classes/{ => task_operators}/URLHTMLTaskOperator.py (98%) rename core/classes/{ => task_operators}/URLMiscellaneousMetadataTaskOperator.py (97%) rename core/classes/{ => task_operators}/URLRecordTypeTaskOperator.py (97%) rename core/classes/{ => task_operators}/URLRelevanceHuggingfaceTaskOperator.py (91%) create mode 100644 core/classes/task_operators/__init__.py diff --git a/alembic/versions/2025_04_16_1954-997f5bf53772_set_user_annotation_tables_to_allow_.py b/alembic/versions/2025_04_16_1954-997f5bf53772_set_user_annotation_tables_to_allow_.py new file mode 100644 index 00000000..775caddf --- /dev/null +++ b/alembic/versions/2025_04_16_1954-997f5bf53772_set_user_annotation_tables_to_allow_.py @@ -0,0 +1,61 @@ +"""Set user annotation tables to allow only one annotation per url + +Revision ID: 997f5bf53772 +Revises: ed06a5633d2e +Create Date: 2025-04-16 19:54:59.798580 + +""" +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. 
+revision: str = '997f5bf53772' +down_revision: Union[str, None] = 'ed06a5633d2e' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Delete entries with more than one annotation + # Relevance + op.execute(""" + with ranked as( + SELECT + id, + ROW_NUMBER() OVER (PARTITION BY URL_ID ORDER BY id) as rn + FROM + USER_RELEVANT_SUGGESTIONS + ) + DELETE FROM user_relevant_suggestions + USING ranked + WHERE USER_RELEVANT_SUGGESTIONS.id = ranked.id + and ranked.rn > 1 + """) + # Record Type + op.execute(""" + with ranked as( + SELECT + id, + ROW_NUMBER() OVER (PARTITION BY URL_ID ORDER BY id) as rn + FROM + USER_RECORD_TYPE_SUGGESTIONS + ) + DELETE FROM user_record_type_suggestions + USING ranked + WHERE USER_RECORD_TYPE_SUGGESTIONS.id = ranked.id + and ranked.rn > 1 + """) + + # Add unique constraint to url_id column + op.create_unique_constraint('uq_user_relevant_suggestions_url_id', 'user_relevant_suggestions', ['url_id']) + op.create_unique_constraint('uq_user_record_type_suggestions_url_id', 'user_record_type_suggestions', ['url_id']) + op.create_unique_constraint('uq_user_agency_suggestions_url_id', 'user_url_agency_suggestions', ['url_id']) + + + +def downgrade() -> None: + op.drop_constraint('uq_user_relevant_suggestions_url_id', 'user_relevant_suggestions', type_='unique') + op.drop_constraint('uq_user_record_type_suggestions_url_id', 'user_record_type_suggestions', type_='unique') + op.drop_constraint('uq_user_agency_suggestions_url_id', 'user_url_agency_suggestions', type_='unique') \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 98410b6f..c8b4a204 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -137,14 +137,13 @@ async def get_next_url_for_user_annotation( URL, ) .where(URL.outcome == URLStatus.PENDING.value) - # URL must not have metadata annotation by this user + # URL must not have user suggestion .where( not_( exists( select(user_suggestion_model_to_exclude) .where( user_suggestion_model_to_exclude.url_id == URL.id, - user_suggestion_model_to_exclude.user_id == user_id ) ) ) @@ -158,7 +157,6 @@ async def get_next_url_for_user_annotation( select(UserRelevantSuggestion) .where( UserRelevantSuggestion.url_id == URL.id, - UserRelevantSuggestion.user_id == user_id, UserRelevantSuggestion.relevant == False ) ) @@ -833,15 +831,14 @@ async def get_next_url_agency_for_annotation( if batch_id is not None: statement = statement.where(URL.batch_id == batch_id) - # Must not have been annotated by this user + # Must not have been annotated by a user statement = ( statement.join(UserUrlAgencySuggestion, isouter=True) .where( ~exists( select(UserUrlAgencySuggestion). where( - (UserUrlAgencySuggestion.user_id == user_id) & - (UserUrlAgencySuggestion.url_id == URL.id) + UserUrlAgencySuggestion.url_id == URL.id ). 
correlate(URL) ) diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py index 422c38ab..f7f44e32 100644 --- a/core/DTOs/GetNextURLForFinalReviewResponse.py +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -23,6 +23,7 @@ class FinalReviewAnnotationRecordTypeInfo(BaseModel): title="A dictionary, sorted by size and omitting zero values, of all record types suggested by users", ) +# region Agency class FinalReviewAnnotationAgencyUserInfo(GetNextURLForAgencyAgencyInfo): count: int = Field(title="Number of times suggested by users") @@ -41,6 +42,7 @@ class FinalReviewAnnotationAgencyInfo(BaseModel): users: Optional[dict[int, FinalReviewAnnotationAgencyUserInfo]] = Field( title="A list, sorted by size, of all agencies suggested by users", ) +# endregion class FinalReviewAnnotationInfo(BaseModel): relevant: FinalReviewAnnotationRelevantInfo = Field( diff --git a/core/TaskManager.py b/core/TaskManager.py index 7796e80e..8a40b129 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -7,13 +7,13 @@ from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome from core.FunctionTrigger import FunctionTrigger -from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator -from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator -from core.classes.TaskOperatorBase import TaskOperatorBase -from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator -from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator -from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator -from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator +from core.classes.task_operators.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator +from core.classes.task_operators.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.task_operators.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator +from core.classes.task_operators.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator +from core.classes.task_operators.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator from core.enums import BatchStatus from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface diff --git a/core/classes/AgencyIdentificationTaskOperator.py b/core/classes/task_operators/AgencyIdentificationTaskOperator.py similarity index 98% rename from core/classes/AgencyIdentificationTaskOperator.py rename to core/classes/task_operators/AgencyIdentificationTaskOperator.py index 1589b96f..4c2d6f1b 100644 --- a/core/classes/AgencyIdentificationTaskOperator.py +++ b/core/classes/task_operators/AgencyIdentificationTaskOperator.py @@ -7,7 +7,7 @@ from collector_manager.enums import CollectorType from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO -from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from 
core.classes.subtasks.AutoGooglerAgencyIdentificationSubtask import AutoGooglerAgencyIdentificationSubtask from core.classes.subtasks.CKANAgencyIdentificationSubtask import CKANAgencyIdentificationSubtask from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask diff --git a/core/classes/SubmitApprovedURLTaskOperator.py b/core/classes/task_operators/SubmitApprovedURLTaskOperator.py similarity index 97% rename from core/classes/SubmitApprovedURLTaskOperator.py rename to core/classes/task_operators/SubmitApprovedURLTaskOperator.py index 81f0b242..86e0229e 100644 --- a/core/classes/SubmitApprovedURLTaskOperator.py +++ b/core/classes/task_operators/SubmitApprovedURLTaskOperator.py @@ -2,7 +2,7 @@ from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO -from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from pdap_api_client.PDAPClient import PDAPClient diff --git a/core/classes/TaskOperatorBase.py b/core/classes/task_operators/TaskOperatorBase.py similarity index 100% rename from core/classes/TaskOperatorBase.py rename to core/classes/task_operators/TaskOperatorBase.py diff --git a/core/classes/URLHTMLTaskOperator.py b/core/classes/task_operators/URLHTMLTaskOperator.py similarity index 98% rename from core/classes/URLHTMLTaskOperator.py rename to core/classes/task_operators/URLHTMLTaskOperator.py index ad279f9d..f6cfa28a 100644 --- a/core/classes/URLHTMLTaskOperator.py +++ b/core/classes/task_operators/URLHTMLTaskOperator.py @@ -4,7 +4,7 @@ from collector_db.enums import TaskType from core.DTOs.task_data_objects.UrlHtmlTDO import UrlHtmlTDO from core.classes.HTMLContentInfoGetter import HTMLContentInfoGetter -from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.URLRequestInterface import URLRequestInterface diff --git a/core/classes/URLMiscellaneousMetadataTaskOperator.py b/core/classes/task_operators/URLMiscellaneousMetadataTaskOperator.py similarity index 97% rename from core/classes/URLMiscellaneousMetadataTaskOperator.py rename to core/classes/task_operators/URLMiscellaneousMetadataTaskOperator.py index 1cbebbc6..68a3a243 100644 --- a/core/classes/URLMiscellaneousMetadataTaskOperator.py +++ b/core/classes/task_operators/URLMiscellaneousMetadataTaskOperator.py @@ -5,7 +5,7 @@ from collector_db.enums import TaskType from collector_manager.enums import CollectorType from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO -from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from core.classes.subtasks.MiscellaneousMetadata.AutoGooglerMiscMetadataSubtask import AutoGooglerMiscMetadataSubtask from core.classes.subtasks.MiscellaneousMetadata.CKANMiscMetadataSubtask import CKANMiscMetadataSubtask from core.classes.subtasks.MiscellaneousMetadata.MiscellaneousMetadataSubtaskBase import \ diff --git a/core/classes/URLRecordTypeTaskOperator.py b/core/classes/task_operators/URLRecordTypeTaskOperator.py similarity index 97% rename from core/classes/URLRecordTypeTaskOperator.py rename to core/classes/task_operators/URLRecordTypeTaskOperator.py index 
3f94811f..ab1f1f08 100644 --- a/core/classes/URLRecordTypeTaskOperator.py +++ b/core/classes/task_operators/URLRecordTypeTaskOperator.py @@ -2,7 +2,7 @@ from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType from core.DTOs.task_data_objects.URLRecordTypeTDO import URLRecordTypeTDO -from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from core.enums import RecordType from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier diff --git a/core/classes/URLRelevanceHuggingfaceTaskOperator.py b/core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py similarity index 91% rename from core/classes/URLRelevanceHuggingfaceTaskOperator.py rename to core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py index e6ebdc3d..4871a9f0 100644 --- a/core/classes/URLRelevanceHuggingfaceTaskOperator.py +++ b/core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py @@ -1,9 +1,8 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient -from collector_db.DTOs.URLMetadataInfo import URLMetadataInfo from collector_db.DTOs.URLWithHTML import URLWithHTML -from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType +from collector_db.enums import TaskType from core.DTOs.task_data_objects.URLRelevanceHuggingfaceTDO import URLRelevanceHuggingfaceTDO -from core.classes.TaskOperatorBase import TaskOperatorBase +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from hugging_face.HuggingFaceInterface import HuggingFaceInterface diff --git a/core/classes/task_operators/__init__.py b/core/classes/task_operators/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 8f1fc630..3ffef203 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -2,7 +2,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLInfo import URLInfo -from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.task_operators.URLHTMLTaskOperator import URLHTMLTaskOperator from helpers.DBDataCreator import DBDataCreator from html_tag_collector.ResponseParser import HTMLResponseParser from html_tag_collector.RootURLCache import RootURLCache diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 0501ac1f..d5b6dade 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -68,6 +68,9 @@ async def test_annotate_relevancy(api_test_helper): # Validate that the correct relevant value is returned assert inner_info_1.suggested_relevant is True + # A second user should see the same URL + + # Annotate with value 'False' and get next URL request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( url_id=inner_info_1.url_info.url_id, @@ -106,7 +109,6 @@ async def test_annotate_relevancy(api_test_helper): assert result_2.relevant is True # If user submits annotation for same URL, the URL should be overwritten - request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = 
api_test_helper.request_validator.post_relevance_annotation_and_get_next(
         url_id=inner_info_1.url_info.url_id,
         relevance_annotation_post_info=RelevanceAnnotationPostInfo(
@@ -420,12 +422,6 @@ async def test_annotate_agency_other_user_annotation(api_test_helper):
     )
     url_ids = setup_info.url_ids
 
-
-    await ath.db_data_creator.manual_suggestion(
-        user_id=MOCK_USER_ID + 1,
-        url_id=url_ids[0],
-    )
-
     response = await ath.request_validator.get_next_agency_annotation()
 
     assert response.next_annotation
@@ -440,6 +436,16 @@ async def test_annotate_agency_other_user_annotation(api_test_helper):
     # Check that one agency_suggestion exists
     assert len(next_annotation.agency_suggestions) == 1
 
+    # Test that another user can insert a suggestion
+    await ath.db_data_creator.manual_suggestion(
+        user_id=MOCK_USER_ID + 1,
+        url_id=url_ids[0],
+    )
+
+    # After this, test that our user does not receive this URL
+    response = await ath.request_validator.get_next_agency_annotation()
+    assert response.next_annotation is None
+
 @pytest.mark.asyncio
 async def test_annotate_agency_submit_and_get_next(api_test_helper):
     """
diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py
index 7b98728f..0de7e16c 100644
--- a/tests/test_automated/integration/collector_db/test_db_client.py
+++ b/tests/test_automated/integration/collector_db/test_db_client.py
@@ -167,7 +167,7 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato
     )
 
     url_mapping = setup_info.url_mapping
-
+    # Add agency auto suggestions
     await db_data_creator.agency_auto_suggestions(
         url_id=url_mapping.url_id,
         count=3
@@ -478,6 +478,11 @@ async def test_approval_url_error(db_data_creator: DBDataCreator):
 async def test_get_next_url_for_user_relevance_annotation_pending(
         db_data_creator: DBDataCreator
 ):
+    """
+    Users should receive a valid URL to annotate
+    All users should receive the same next URL
+    Once any user annotates that URL, none of the users should receive it again
+    """
     setup_info = await setup_for_get_next_url_for_annotation(
         db_data_creator=db_data_creator,
         url_count=2
@@ -492,11 +497,45 @@ async def test_get_next_url_for_user_relevance_annotation_pending(
     )
 
     adb_client = db_data_creator.adb_client
-    url = await adb_client.get_next_url_for_relevance_annotation(
+    url_1 = await adb_client.get_next_url_for_relevance_annotation(
         user_id=1,
         batch_id=None
     )
-    assert url is not None
+    assert url_1 is not None
+
+    url_2 = await adb_client.get_next_url_for_relevance_annotation(
+        user_id=2,
+        batch_id=None
+    )
+    assert url_2 is not None
+
+    assert url_1.url_info.url == url_2.url_info.url
+
+    # Annotate this URL, then check that the second URL is returned
+    await adb_client.add_user_relevant_suggestion(
+        url_id=url_1.url_info.url_id,
+        user_id=1,
+        relevant=True
+    )
+
+    url_3 = await adb_client.get_next_url_for_relevance_annotation(
+        user_id=1,
+        batch_id=None
+    )
+    assert url_3 is not None
+
+    assert url_1 != url_3
+
+    # Check that the second URL is also returned for another user
+    url_4 = await adb_client.get_next_url_for_relevance_annotation(
+        user_id=2,
+        batch_id=None
+    )
+    assert url_4 is not None
+
+
+    assert url_4 == url_3
+
 
 @pytest.mark.asyncio
 async def test_get_next_url_for_annotation_batch_filtering(
@@ -643,26 +682,27 @@ async def test_annotate_url_marked_not_relevant(db_data_creator: DBDataCreator):
     )
     assert record_type_annotation_info.url_info.url_id != url_to_mark_not_relevant.url_id
 
-    # Other users should still receive the URL for record type annotation
+    # Other users also should not receive the URL for record type annotation
     record_type_annotation_info = await adb_client.get_next_url_for_record_type_annotation(
         user_id=2,
         batch_id=None
     )
-    assert record_type_annotation_info.url_info.url_id == url_to_mark_not_relevant.url_id
+    assert record_type_annotation_info.url_info.url_id != \
+        url_to_mark_not_relevant.url_id, "Other users should not receive the URL for record type annotation"
 
     # User should not receive the URL for agency annotation
-    agency_annotation_info = await adb_client.get_next_url_agency_for_annotation(
+    agency_annotation_info_user_1 = await adb_client.get_next_url_agency_for_annotation(
         user_id=1,
         batch_id=None
     )
-    assert agency_annotation_info.next_annotation.url_id != url_to_mark_not_relevant.url_id
+    assert agency_annotation_info_user_1.next_annotation.url_id != url_to_mark_not_relevant.url_id
 
-    # Other users should still receive the URL for agency annotation
-    agency_annotation_info = await adb_client.get_next_url_agency_for_annotation(
+    # Other users also should not receive the URL for agency annotation
+    agency_annotation_info_user_2 = await adb_client.get_next_url_agency_for_annotation(
         user_id=2,
         batch_id=None
     )
-    assert agency_annotation_info.next_annotation.url_id == url_to_mark_not_relevant.url_id
+    assert agency_annotation_info_user_2.next_annotation.url_id != url_to_mark_not_relevant.url_id
 
 @pytest.mark.asyncio
 async def test_annotate_url_agency_agency_not_in_db(db_data_creator: DBDataCreator):
@@ -681,4 +721,117 @@ async def test_annotate_url_agency_agency_not_in_db(db_data_creat
     agencies = await db_data_creator.adb_client.get_all(Agency)
 
     assert len(agencies)
-    assert agencies[0].name == PLACEHOLDER_AGENCY_NAME
\ No newline at end of file
+    assert agencies[0].name == PLACEHOLDER_AGENCY_NAME
+
+@pytest.mark.asyncio
+async def test_get_next_url_for_user_record_type_annotation(db_data_creator: DBDataCreator):
+    """
+    All users should receive the same next valid URL for record type annotation
+    Once any user annotates that URL, none of the users should receive it
+    """
+    setup_info = await setup_for_get_next_url_for_annotation(
+        db_data_creator,
+        url_count=2
+    )
+
+    # All users should receive the same URL
+    url_1 = setup_info.insert_urls_info.url_mappings[0]
+    url_2 = setup_info.insert_urls_info.url_mappings[1]
+
+    adb_client = db_data_creator.adb_client
+
+    url_user_1 = await adb_client.get_next_url_for_record_type_annotation(
+        user_id=1,
+        batch_id=None
+    )
+    assert url_user_1 is not None
+
+    url_user_2 = await adb_client.get_next_url_for_record_type_annotation(
+        user_id=2,
+        batch_id=None
+    )
+
+    assert url_user_2 is not None
+
+    # Check that the URLs are the same
+    assert url_user_1 == url_user_2
+
+    # After annotating, both users should receive a different URL
+    await adb_client.add_user_record_type_suggestion(
+        user_id=1,
+        url_id=url_1.url_id,
+        record_type=RecordType.ARREST_RECORDS
+    )
+
+    next_url_user_1 = await adb_client.get_next_url_for_record_type_annotation(
+        user_id=1,
+        batch_id=None
+    )
+
+    next_url_user_2 = await adb_client.get_next_url_for_record_type_annotation(
+        user_id=2,
+        batch_id=None
+    )
+
+    assert next_url_user_1 != url_user_1
+    assert next_url_user_1 == next_url_user_2
+
+
+
+
+@pytest.mark.asyncio
+async def test_get_next_url_for_user_agency_annotation(db_data_creator: DBDataCreator):
+    """
+    All users should receive the same next valid URL for agency annotation
+    Once any user annotates that URL, none of the users should receive it
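+
+    A sketch of the shared queue these tests exercise (mirroring the
+    exists()-based exclusion now used in AsyncDatabaseClient, which no
+    longer filters on user_id):
+
+        statement = statement.where(
+            ~exists(
+                select(UserUrlAgencySuggestion)
+                .where(UserUrlAgencySuggestion.url_id == URL.id)
+                .correlate(URL)
+            )
+        )
+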
""" + setup_info = await setup_for_annotate_agency( + db_data_creator, + url_count=2 + ) + + # All users should receive the same URL + url_1 = setup_info.url_ids[0] + url_2 = setup_info.url_ids[1] + + adb_client = db_data_creator.adb_client + url_user_1 = await adb_client.get_next_url_agency_for_annotation( + user_id=1, + batch_id=None + ) + assert url_user_1 is not None + + url_user_2 = await adb_client.get_next_url_agency_for_annotation( + user_id=2, + batch_id=None + ) + + assert url_user_2 is not None + + # Check that the URLs are the same + assert url_user_1 == url_user_2 + + # Annotate the URL + await adb_client.add_agency_manual_suggestion( + url_id=url_1, + user_id=1, + is_new=True, + agency_id=None + ) + + # Both users should receive the next URL + next_url_user_1 = await adb_client.get_next_url_agency_for_annotation( + user_id=1, + batch_id=None + ) + assert next_url_user_1 is not None + + next_url_user_2 = await adb_client.get_next_url_agency_for_annotation( + user_id=2, + batch_id=None + ) + assert next_url_user_2 is not None + + assert url_user_1 != next_url_user_1 + assert next_url_user_1 == next_url_user_2 diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py index 1c1289e7..8fb9f4a5 100644 --- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -1,4 +1,3 @@ -import types from copy import deepcopy from typing import Optional from unittest.mock import MagicMock, AsyncMock, patch @@ -11,7 +10,7 @@ from collector_manager.enums import CollectorType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo -from core.classes.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator +from core.classes.task_operators.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator from core.classes.subtasks.AutoGooglerAgencyIdentificationSubtask import AutoGooglerAgencyIdentificationSubtask from core.classes.subtasks.CKANAgencyIdentificationSubtask import CKANAgencyIdentificationSubtask from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask diff --git a/tests/test_automated/integration/tasks/test_example_task.py b/tests/test_automated/integration/tasks/test_example_task.py index 819d0dc0..2211458c 100644 --- a/tests/test_automated/integration/tasks/test_example_task.py +++ b/tests/test_automated/integration/tasks/test_example_task.py @@ -4,8 +4,7 @@ from collector_db.enums import TaskType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome -from core.classes.TaskOperatorBase import TaskOperatorBase -from core.enums import BatchStatus +from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase from tests.helpers.DBDataCreator import DBDataCreator class ExampleTaskOperator(TaskOperatorBase): diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py index b15ff9d5..2d3aa192 100644 --- a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py +++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py @@ -8,7 +8,7 @@ from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.TaskOperatorRunInfo 
import TaskOperatorOutcome -from core.classes.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator +from core.classes.task_operators.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator from core.enums import RecordType, SubmitResponseStatus from tests.helpers.DBDataCreator import BatchURLCreationInfo, DBDataCreator from pdap_api_client.AccessManager import AccessManager diff --git a/tests/test_automated/integration/tasks/test_url_html_task.py b/tests/test_automated/integration/tasks/test_url_html_task.py index 3839d0a6..4c33016b 100644 --- a/tests/test_automated/integration/tasks/test_url_html_task.py +++ b/tests/test_automated/integration/tasks/test_url_html_task.py @@ -6,7 +6,7 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.enums import TaskType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome -from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator +from core.classes.task_operators.URLHTMLTaskOperator import URLHTMLTaskOperator from tests.helpers.DBDataCreator import DBDataCreator from html_tag_collector.DataClassTags import ResponseHTMLInfo from html_tag_collector.ResponseParser import HTMLResponseParser diff --git a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py index 818d5aef..526efa70 100644 --- a/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py +++ b/tests/test_automated/integration/tasks/test_url_miscellaneous_metadata_task.py @@ -5,7 +5,7 @@ from collector_db.models import URL, URLOptionalDataSourceMetadata from collector_manager.enums import CollectorType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome -from core.classes.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator +from core.classes.task_operators.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator from tests.helpers.DBDataCreator import DBDataCreator diff --git a/tests/test_automated/integration/tasks/test_url_record_type_task.py b/tests/test_automated/integration/tasks/test_url_record_type_task.py index c56acec1..c941bcf7 100644 --- a/tests/test_automated/integration/tasks/test_url_record_type_task.py +++ b/tests/test_automated/integration/tasks/test_url_record_type_task.py @@ -5,8 +5,8 @@ from collector_db.enums import TaskType from collector_db.models import AutoRecordTypeSuggestion from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome -from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator -from core.enums import RecordType, BatchStatus +from core.classes.task_operators.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator +from core.enums import RecordType from tests.helpers.DBDataCreator import DBDataCreator from llm_api_logic.DeepSeekRecordClassifier import DeepSeekRecordClassifier diff --git a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py index 11ef770a..abe15965 100644 --- a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py +++ b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py @@ -4,10 +4,9 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLWithHTML import URLWithHTML -from collector_db.enums import ValidationStatus, ValidationSource from collector_db.models import 
AutoRelevantSuggestion from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome -from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator +from core.classes.task_operators.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator from tests.helpers.assert_functions import assert_database_has_no_tasks from hugging_face.HuggingFaceInterface import HuggingFaceInterface From b5605945dab955087ffc81814f71753859cb0eab Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 16 Apr 2025 20:22:47 -0400 Subject: [PATCH 111/182] fix(tests): fix broken tests --- tests/helpers/complex_test_data_functions.py | 13 ++--- .../integration/api/test_review.py | 10 +--- .../collector_db/test_db_client.py | 47 ++----------------- 3 files changed, 9 insertions(+), 61 deletions(-) diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 18d3f92a..955e1cf6 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -1,3 +1,5 @@ +from typing import Optional + from pydantic import BaseModel from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo @@ -57,7 +59,7 @@ class FinalReviewSetupInfo(BaseModel): async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, - annotation_count: int, + annotation_count: Optional[int] = None, include_user_annotations: bool = True, include_miscellaneous_metadata: bool = True ) -> FinalReviewSetupInfo: @@ -109,16 +111,9 @@ async def add_relevant_suggestion(count: int, relevant: bool): ) if include_user_annotations: - await add_relevant_suggestion(annotation_count, True) await add_relevant_suggestion(1, False) - await add_record_type_suggestion(3, RecordType.ARREST_RECORDS) - await add_record_type_suggestion(2, RecordType.DISPATCH_RECORDINGS) await add_record_type_suggestion(1, RecordType.ACCIDENT_REPORTS) - - if include_user_annotations: - # Add user suggestions for agencies, one suggested by 3 users, another by 2, another by 1 - for i in range(annotation_count): - await add_agency_suggestion(i + 1) + await add_agency_suggestion(1) return FinalReviewSetupInfo( batch_id=batch_id, diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index 61b1ef7e..494765b6 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -15,7 +15,6 @@ async def test_review_next_source(api_test_helper): setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, - annotation_count=3, include_user_annotations=True ) url_mapping = setup_info.url_mapping @@ -47,16 +46,13 @@ async def test_review_next_source(api_test_helper): annotation_info = result.annotations relevant_info = annotation_info.relevant assert relevant_info.auto == True - assert relevant_info.users.relevant == 3 assert relevant_info.users.not_relevant == 1 record_type_info = annotation_info.record_type assert record_type_info.auto == RecordType.ARREST_RECORDS user_d = record_type_info.users - assert user_d[RecordType.ARREST_RECORDS] == 3 - assert user_d[RecordType.DISPATCH_RECORDINGS] == 2 assert user_d[RecordType.ACCIDENT_REPORTS] == 1 - assert list(user_d.keys()) == [RecordType.ARREST_RECORDS, RecordType.DISPATCH_RECORDINGS, RecordType.ACCIDENT_REPORTS] + assert list(user_d.keys()) == [RecordType.ACCIDENT_REPORTS] agency_info = annotation_info.agency @@ -67,9 +63,7 @@ 
async def test_review_next_source(api_test_helper): # Check user agency suggestions exist and in descending order of count user_agency_suggestions = agency_info.users user_agency_suggestions_as_list = list(user_agency_suggestions.values()) - assert len(user_agency_suggestions_as_list) == 3 - for i in range(3): - assert user_agency_suggestions_as_list[i].count == 3 - i + assert len(user_agency_suggestions_as_list) == 1 # Check confirmed agencies exist confirmed_agencies = agency_info.confirmed diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 0de7e16c..71bed7b4 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -162,7 +162,7 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, - annotation_count=3, + annotation_count=1, include_user_annotations=True ) @@ -186,16 +186,13 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato annotation_info = result.annotations relevant_info = annotation_info.relevant assert relevant_info.auto == True - assert relevant_info.users.relevant == 3 assert relevant_info.users.not_relevant == 1 record_type_info = annotation_info.record_type assert record_type_info.auto == RecordType.ARREST_RECORDS user_d = record_type_info.users - assert user_d[RecordType.ARREST_RECORDS] == 3 - assert user_d[RecordType.DISPATCH_RECORDINGS] == 2 assert user_d[RecordType.ACCIDENT_REPORTS] == 1 - assert list(user_d.keys()) == [RecordType.ARREST_RECORDS, RecordType.DISPATCH_RECORDINGS, RecordType.ACCIDENT_REPORTS] + assert list(user_d.keys()) == [RecordType.ACCIDENT_REPORTS] agency_info = annotation_info.agency @@ -206,9 +203,7 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato # Check user agency suggestions exist and in descending order of count user_agency_suggestions = agency_info.users user_agency_suggestions_as_list = list(user_agency_suggestions.values()) - assert len(user_agency_suggestions_as_list) == 3 - for i in range(3): - assert user_agency_suggestions_as_list[i].count == 3 - i + assert len(user_agency_suggestions_as_list) == 1 @pytest.mark.asyncio async def test_get_next_url_for_final_review_batch_id_filtering(db_data_creator: DBDataCreator): @@ -280,42 +275,6 @@ async def test_get_next_url_for_final_review_favor_more_components(db_data_creat -@pytest.mark.asyncio -async def test_get_next_url_for_final_review_favor_more_annotations( - db_data_creator: DBDataCreator, - wipe_database -): - """ - Test in the case of two URLs with the same number of components annotated, favoring the one with more total annotations - """ - setup_info_lower_count = await setup_for_get_next_url_for_final_review( - db_data_creator=db_data_creator, - annotation_count=1, - include_user_annotations=True - ) - url_mapping_lower_count = setup_info_lower_count.url_mapping - - setup_info_higher_count = await setup_for_get_next_url_for_final_review( - db_data_creator=db_data_creator, - annotation_count=3, - include_user_annotations=True - ) - url_mapping_higher_count = setup_info_higher_count.url_mapping - - for url_mapping in [url_mapping_lower_count, url_mapping_higher_count]: - await db_data_creator.agency_confirmed_suggestion( - url_id=url_mapping.url_id - ) - - result = await 
db_data_creator.adb_client.get_next_url_for_final_review( - batch_id=None - ) - - assert result.id == url_mapping_higher_count.url_id - - assert result.annotations.agency.confirmed is not None - - @pytest.mark.asyncio From 345a257410a33cd02e004c267c45128f77678ca8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 17 Apr 2025 08:01:41 -0400 Subject: [PATCH 112/182] feat(api): adjust final review to reflect single user annotations Previously, the final review showed multiple user annotations for a URL. Now, because each URL can only have one user for each type of annotation, the endpoint has been updated. --- collector_db/AsyncDatabaseClient.py | 31 ++------ collector_db/DTOConverter.py | 78 +++++++------------ collector_db/models.py | 18 ++--- core/DTOs/GetNextURLForFinalReviewResponse.py | 23 +++--- tests/helpers/complex_test_data_functions.py | 50 ++++++------ .../integration/api/test_review.py | 16 ++-- .../collector_db/test_db_client.py | 19 ++--- 7 files changed, 92 insertions(+), 143 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index c8b4a204..8ceda774 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1065,19 +1065,6 @@ def count_subquery(model: Type[Base]): ) ) - count_subqueries = [ - count_subquery(model=model) - for model in models - ] - - sum_of_count_subqueries = ( - sum( - [ - coalesce(subquery.c.count, 0) - for subquery in count_subqueries - ] - ) - ) # Basic URL query url_query = ( @@ -1086,13 +1073,10 @@ def count_subquery(model: Type[Base]): ( sum_of_exist_subqueries ).label("total_distinct_annotation_count"), - ( - sum_of_count_subqueries - ).label("total_overall_annotation_count") ) ) - for subquery in (exist_subqueries + count_subqueries): + for subquery in exist_subqueries: url_query = url_query.outerjoin( subquery, URL.id == subquery.c.url_id ) @@ -1110,8 +1094,8 @@ def count_subquery(model: Type[Base]): URL.html_content, URL.auto_record_type_suggestion, URL.auto_relevant_suggestion, - URL.user_relevant_suggestions, - URL.user_record_type_suggestions, + URL.user_relevant_suggestion, + URL.user_record_type_suggestion, URL.optional_data_source_metadata, ] @@ -1122,7 +1106,7 @@ def count_subquery(model: Type[Base]): # The below relationships are joined to entities that are joined to the URL double_join_relationships = [ (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), - (URL.user_agency_suggestions, UserUrlAgencySuggestion.agency), + (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), (URL.confirmed_agencies, ConfirmedURLAgency.agency) ] for primary, secondary in double_join_relationships: @@ -1134,7 +1118,6 @@ def count_subquery(model: Type[Base]): # Apply order clause url_query = url_query.order_by( desc("total_distinct_annotation_count"), - desc("total_overall_annotation_count"), asc(URL.id) ) @@ -1173,16 +1156,16 @@ def count_subquery(model: Type[Base]): description=result.description, annotations=FinalReviewAnnotationInfo( relevant=DTOConverter.final_review_annotation_relevant_info( - user_suggestions=result.user_relevant_suggestions, + user_suggestion=result.user_relevant_suggestion, auto_suggestion=result.auto_relevant_suggestion ), record_type=DTOConverter.final_review_annotation_record_type_info( - user_suggestions=result.user_record_type_suggestions, + user_suggestion=result.user_record_type_suggestion, auto_suggestion=result.auto_record_type_suggestion ), agency=DTOConverter.final_review_annotation_agency_info( 
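+                    # The user_* relationships are uselist=False after this patch (see
+                    # the models.py hunk below), so each converter receives a single
+                    # ORM row, or None, rather than a list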
automated_agency_suggestions=result.automated_agency_suggestions, - user_agency_suggestions=result.user_agency_suggestions, + user_agency_suggestion=result.user_agency_suggestion, confirmed_agencies=result.confirmed_agencies ) ), diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py index 0d2856cf..2b6cf521 100644 --- a/collector_db/DTOConverter.py +++ b/collector_db/DTOConverter.py @@ -2,16 +2,15 @@ from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType, URLHTMLContentInfo from collector_db.DTOs.URLWithHTML import URLWithHTML -from collector_db.enums import ValidationStatus, ValidationSource, URLMetadataAttributeType from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent, URL, Agency, \ AutoRecordTypeSuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, AutoRelevantSuggestion, \ ConfirmedURLAgency from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo from core.DTOs.GetNextURLForFinalReviewResponse import FinalReviewAnnotationRelevantInfo, \ - FinalReviewAnnotationRelevantUsersInfo, FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ - FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyUserInfo + FinalReviewAnnotationRecordTypeInfo, FinalReviewAnnotationAgencyAutoInfo, \ + FinalReviewAnnotationAgencyInfo from core.enums import RecordType, SuggestionType -from html_tag_collector.DataClassTags import convert_to_response_html_info, ResponseHTMLInfo, ENUM_TO_ATTRIBUTE_MAPPING +from html_tag_collector.DataClassTags import ResponseHTMLInfo, ENUM_TO_ATTRIBUTE_MAPPING class DTOConverter: @@ -21,49 +20,35 @@ class DTOConverter: @staticmethod def final_review_annotation_relevant_info( - user_suggestions: list[UserRelevantSuggestion], + user_suggestion: UserRelevantSuggestion, auto_suggestion: AutoRelevantSuggestion ) -> FinalReviewAnnotationRelevantInfo: auto_value = auto_suggestion.relevant if auto_suggestion else None - - relevant_count = 0 - not_relevant_count = 0 - for suggestion in user_suggestions: - if suggestion.relevant: - relevant_count += 1 - else: - not_relevant_count += 1 + user_value = user_suggestion.relevant if user_suggestion else None return FinalReviewAnnotationRelevantInfo( auto=auto_value, - users=FinalReviewAnnotationRelevantUsersInfo( - relevant=relevant_count, - not_relevant=not_relevant_count - ) + user=user_value ) @staticmethod def final_review_annotation_record_type_info( - user_suggestions: list[UserRecordTypeSuggestion], + user_suggestion: UserRecordTypeSuggestion, auto_suggestion: AutoRecordTypeSuggestion ): - user_count = {} if auto_suggestion is None: auto_value = None else: auto_value = RecordType(auto_suggestion.record_type) - for suggestion in user_suggestions: - value = RecordType(suggestion.record_type) - if value not in user_count: - user_count[value] = 0 - user_count[value] += 1 - # Sort users by count, descending - user_count = dict(sorted(user_count.items(), key=lambda x: x[1], reverse=True)) + if user_suggestion is None: + user_value = None + else: + user_value = RecordType(user_suggestion.record_type) return FinalReviewAnnotationRecordTypeInfo( auto=auto_value, - users=user_count + user=user_value ) @staticmethod @@ -109,27 +94,20 @@ def final_review_annotation_agency_auto_info( @staticmethod def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_url_agency_suggestions: list[UserUrlAgencySuggestion] - ) -> dict[int, FinalReviewAnnotationAgencyUserInfo]: - d = {} - for suggestion 
in user_url_agency_suggestions: - agency = suggestion.agency - agency_id = agency.agency_id - if agency.agency_id not in d: - d[agency_id] = FinalReviewAnnotationAgencyUserInfo( - suggestion_type=SuggestionType.MANUAL_SUGGESTION, - agency_name=agency.name, - pdap_agency_id=agency_id, - state=agency.state, - county=agency.county, - locality=agency.locality, - count=1 - ) - else: - d[agency_id].count += 1 + user_url_agency_suggestion: UserUrlAgencySuggestion + ) -> Optional[GetNextURLForAgencyAgencyInfo]: + suggestion = user_url_agency_suggestion + if suggestion is None: + return None + return GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.MANUAL_SUGGESTION, + pdap_agency_id=suggestion.agency_id, + agency_name=suggestion.agency.name, + state=suggestion.agency.state, + county=suggestion.agency.county, + locality=suggestion.agency.locality + ) - # Return sorted - return dict(sorted(d.items(), key=lambda x: x[1].count, reverse=True)) @staticmethod def confirmed_agencies_to_final_review_annotation_agency_info( @@ -154,7 +132,7 @@ def confirmed_agencies_to_final_review_annotation_agency_info( def final_review_annotation_agency_info( automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], confirmed_agencies: list[ConfirmedURLAgency], - user_agency_suggestions: list[UserUrlAgencySuggestion] + user_agency_suggestion: UserUrlAgencySuggestion ): confirmed_agency_info = DTOConverter.confirmed_agencies_to_final_review_annotation_agency_info( @@ -166,12 +144,12 @@ def final_review_annotation_agency_info( ) agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_agency_suggestions + user_agency_suggestion ) return FinalReviewAnnotationAgencyInfo( confirmed=confirmed_agency_info, - users=agency_user_info, + user=agency_user_info, auto=agency_auto_info ) diff --git a/collector_db/models.py b/collector_db/models.py index e98ef437..4ac117d6 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -119,16 +119,16 @@ class URL(Base): ) automated_agency_suggestions = relationship( "AutomatedUrlAgencySuggestion", back_populates="url") - user_agency_suggestions = relationship( - "UserUrlAgencySuggestion", back_populates="url") + user_agency_suggestion = relationship( + "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( "AutoRecordTypeSuggestion", uselist=False, back_populates="url") - user_record_type_suggestions = relationship( - "UserRecordTypeSuggestion", back_populates="url") + user_record_type_suggestion = relationship( + "UserRecordTypeSuggestion", uselist=False, back_populates="url") auto_relevant_suggestion = relationship( "AutoRelevantSuggestion", uselist=False, back_populates="url") - user_relevant_suggestions = relationship( - "UserRelevantSuggestion", back_populates="url") + user_relevant_suggestion = relationship( + "UserRelevantSuggestion", uselist=False, back_populates="url") reviewing_user = relationship( "ReviewingUserURL", uselist=False, back_populates="url") optional_data_source_metadata = relationship( @@ -375,7 +375,7 @@ class UserUrlAgencySuggestion(Base): is_new = Column(Boolean, nullable=True) agency = relationship("Agency", back_populates="user_suggestions") - url = relationship("URL", back_populates="user_agency_suggestions") + url = relationship("URL", back_populates="user_agency_suggestion") __table_args__ = ( UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), @@ -432,7 +432,7 @@ class 
UserRelevantSuggestion(Base): # Relationships - url = relationship("URL", back_populates="user_relevant_suggestions") + url = relationship("URL", back_populates="user_relevant_suggestion") class UserRecordTypeSuggestion(Base): @@ -451,4 +451,4 @@ class UserRecordTypeSuggestion(Base): # Relationships - url = relationship("URL", back_populates="user_record_type_suggestions") \ No newline at end of file + url = relationship("URL", back_populates="user_record_type_suggestion") \ No newline at end of file diff --git a/core/DTOs/GetNextURLForFinalReviewResponse.py b/core/DTOs/GetNextURLForFinalReviewResponse.py index f7f44e32..c9e838b6 100644 --- a/core/DTOs/GetNextURLForFinalReviewResponse.py +++ b/core/DTOs/GetNextURLForFinalReviewResponse.py @@ -6,26 +6,21 @@ from core.enums import RecordType from html_tag_collector.DataClassTags import ResponseHTMLInfo - -class FinalReviewAnnotationRelevantUsersInfo(BaseModel): - relevant: int = Field(title="Number of users who marked the URL as relevant") - not_relevant: int = Field(title="Number of users who marked the URL as not relevant") - class FinalReviewAnnotationRelevantInfo(BaseModel): auto: Optional[bool] = Field(title="Whether the auto-labeler has marked the URL as relevant") - users: FinalReviewAnnotationRelevantUsersInfo = Field( - title="How users identified the relevancy of the source", + user: Optional[bool] = Field( + title="Whether a user has marked the URL as relevant", ) class FinalReviewAnnotationRecordTypeInfo(BaseModel): - auto: Optional[RecordType] = Field(title="The record type suggested by the auto-labeler") - users: dict[RecordType, int] = Field( - title="A dictionary, sorted by size and omitting zero values, of all record types suggested by users", + auto: Optional[RecordType] = Field( + title="The record type suggested by the auto-labeler" + ) + user: Optional[RecordType] = Field( + title="The record type suggested by a user", ) # region Agency -class FinalReviewAnnotationAgencyUserInfo(GetNextURLForAgencyAgencyInfo): - count: int = Field(title="Number of times suggested by users") class FinalReviewAnnotationAgencyAutoInfo(BaseModel): unknown: bool = Field(title="Whether the auto-labeler suggested the URL as unknown") @@ -39,8 +34,8 @@ class FinalReviewAnnotationAgencyInfo(BaseModel): ) auto: Optional[FinalReviewAnnotationAgencyAutoInfo] = Field( title="A single agency or a list of agencies suggested by the auto-labeler",) - users: Optional[dict[int, FinalReviewAnnotationAgencyUserInfo]] = Field( - title="A list, sorted by size, of all agencies suggested by users", + user: Optional[GetNextURLForAgencyAgencyInfo] = Field( + title="A single agency suggested by a user", ) # endregion diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py index 955e1cf6..6f9ca7c3 100644 --- a/tests/helpers/complex_test_data_functions.py +++ b/tests/helpers/complex_test_data_functions.py @@ -56,6 +56,7 @@ async def setup_for_annotate_agency( class FinalReviewSetupInfo(BaseModel): batch_id: int url_mapping: URLMapping + user_agency_id: Optional[int] async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, @@ -78,27 +79,25 @@ async def setup_for_get_next_url_for_final_review( await db_data_creator.url_miscellaneous_metadata(url_id=url_mapping.url_id) await db_data_creator.html_data([url_mapping.url_id]) - async def add_agency_suggestion(count: int): + async def add_agency_suggestion() -> int: agency_id = await db_data_creator.agency() - for i in range(count): - await 
db_data_creator.agency_user_suggestions( - url_id=url_mapping.url_id, - agency_id=agency_id - ) - - async def add_record_type_suggestion(count: int, record_type: RecordType): - for i in range(count): - await db_data_creator.user_record_type_suggestion( - url_id=url_mapping.url_id, - record_type=record_type - ) - - async def add_relevant_suggestion(count: int, relevant: bool): - for i in range(count): - await db_data_creator.user_relevant_suggestion( - url_id=url_mapping.url_id, - relevant=relevant - ) + await db_data_creator.agency_user_suggestions( + url_id=url_mapping.url_id, + agency_id=agency_id + ) + return agency_id + + async def add_record_type_suggestion(record_type: RecordType): + await db_data_creator.user_record_type_suggestion( + url_id=url_mapping.url_id, + record_type=record_type + ) + + async def add_relevant_suggestion(relevant: bool): + await db_data_creator.user_relevant_suggestion( + url_id=url_mapping.url_id, + relevant=relevant + ) await db_data_creator.auto_relevant_suggestions( url_id=url_mapping.url_id, @@ -111,11 +110,14 @@ async def add_relevant_suggestion(count: int, relevant: bool): ) if include_user_annotations: - await add_relevant_suggestion(1, False) - await add_record_type_suggestion(1, RecordType.ACCIDENT_REPORTS) - await add_agency_suggestion(1) + await add_relevant_suggestion(False) + await add_record_type_suggestion(RecordType.ACCIDENT_REPORTS) + user_agency_id = await add_agency_suggestion() + else: + user_agency_id = None return FinalReviewSetupInfo( batch_id=batch_id, - url_mapping=url_mapping + url_mapping=url_mapping, + user_agency_id=user_agency_id ) diff --git a/tests/test_automated/integration/api/test_review.py b/tests/test_automated/integration/api/test_review.py index 494765b6..1f427c61 100644 --- a/tests/test_automated/integration/api/test_review.py +++ b/tests/test_automated/integration/api/test_review.py @@ -46,14 +46,11 @@ async def test_review_next_source(api_test_helper): annotation_info = result.annotations relevant_info = annotation_info.relevant assert relevant_info.auto == True - assert relevant_info.users.not_relevant == 1 + assert relevant_info.user == False record_type_info = annotation_info.record_type assert record_type_info.auto == RecordType.ARREST_RECORDS - user_d = record_type_info.users - assert user_d[RecordType.ACCIDENT_REPORTS] == 1 - assert list(user_d.keys()) == [RecordType.ACCIDENT_REPORTS] - + assert record_type_info.user == RecordType.ACCIDENT_REPORTS agency_info = annotation_info.agency auto_agency_suggestions = agency_info.auto @@ -61,9 +58,9 @@ async def test_review_next_source(api_test_helper): assert len(auto_agency_suggestions.suggestions) == 3 # Check user agency suggestions exist and in descending order of count - user_agency_suggestions = agency_info.users - user_agency_suggestions_as_list = list(user_agency_suggestions.values()) - assert len(user_agency_suggestions_as_list) == 1 + user_agency_suggestion = agency_info.user + assert user_agency_suggestion.pdap_agency_id == setup_info.user_agency_id + # Check confirmed agencies exist confirmed_agencies = agency_info.confirmed @@ -78,13 +75,12 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): setup_info = await setup_for_get_next_url_for_final_review( db_data_creator=db_data_creator, - annotation_count=3, include_user_annotations=True ) url_mapping = setup_info.url_mapping # Add confirmed agency - confirmed_agency = await db_data_creator.confirmed_suggestions([url_mapping.url_id]) + await 
db_data_creator.confirmed_suggestions([url_mapping.url_id]) # Additionally, include an agency not yet included in the database additional_agency = 999999 diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 71bed7b4..5ea0bee2 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -186,24 +186,20 @@ async def test_get_next_url_for_final_review_basic(db_data_creator: DBDataCreato annotation_info = result.annotations relevant_info = annotation_info.relevant assert relevant_info.auto == True - assert relevant_info.users.not_relevant == 1 + assert relevant_info.user == False record_type_info = annotation_info.record_type assert record_type_info.auto == RecordType.ARREST_RECORDS - user_d = record_type_info.users - assert user_d[RecordType.ACCIDENT_REPORTS] == 1 - assert list(user_d.keys()) == [RecordType.ACCIDENT_REPORTS] - + assert record_type_info.user == RecordType.ACCIDENT_REPORTS agency_info = annotation_info.agency auto_agency_suggestions = agency_info.auto assert auto_agency_suggestions.unknown == False assert len(auto_agency_suggestions.suggestions) == 3 - # Check user agency suggestions exist and in descending order of count - user_agency_suggestions = agency_info.users - user_agency_suggestions_as_list = list(user_agency_suggestions.values()) - assert len(user_agency_suggestions_as_list) == 1 + # Check user agency suggestion exists and is correct + assert agency_info.user.pdap_agency_id == setup_info.user_agency_id + @pytest.mark.asyncio async def test_get_next_url_for_final_review_batch_id_filtering(db_data_creator: DBDataCreator): @@ -301,12 +297,11 @@ async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBD record_type = annotations.record_type assert record_type.auto is None - assert record_type.users == {} + assert record_type.user is None relevant = annotations.relevant assert relevant.auto is None - assert relevant.users.relevant == 0 - assert relevant.users.not_relevant == 0 + assert relevant.user is None @pytest.mark.asyncio async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator: DBDataCreator): From 6858bc0000f4d454b9c8ea228ff81ff76a8a5811 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 17 Apr 2025 09:21:23 -0400 Subject: [PATCH 113/182] feat(app): change batch status `completed` to `ready to label` --- ...hange_batch_completed_to_ready_to_label.py | 36 +++++++++++++++++++ collector_db/StatementComposer.py | 2 +- collector_db/models.py | 2 +- collector_manager/AsyncCollectorBase.py | 2 +- core/TaskManager.py | 2 +- core/enums.py | 2 +- .../lifecycle/test_auto_googler_lifecycle.py | 2 +- .../core/lifecycle/test_ckan_lifecycle.py | 2 +- .../test_common_crawler_lifecycle.py | 2 +- .../lifecycle/test_muckrock_lifecycles.py | 6 ++-- .../integration/api/test_example_collector.py | 6 ++-- .../integration/core/test_async_core.py | 4 +-- .../core/test_example_collector_lifecycle.py | 6 ++-- util/alembic_helpers.py | 13 ++++++- 14 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 alembic/versions/2025_04_17_0909-e285e6e7cf71_change_batch_completed_to_ready_to_label.py diff --git a/alembic/versions/2025_04_17_0909-e285e6e7cf71_change_batch_completed_to_ready_to_label.py b/alembic/versions/2025_04_17_0909-e285e6e7cf71_change_batch_completed_to_ready_to_label.py new file mode 100644 index 00000000..882c2c5f --- /dev/null +++ 
b/alembic/versions/2025_04_17_0909-e285e6e7cf71_change_batch_completed_to_ready_to_label.py @@ -0,0 +1,36 @@ +"""Change batch completed to ready to label + +Revision ID: e285e6e7cf71 +Revises: 997f5bf53772 +Create Date: 2025-04-17 09:09:38.137131 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from util.alembic_helpers import switch_enum_type, alter_enum_value + +# revision identifiers, used by Alembic. +revision: str = 'e285e6e7cf71' +down_revision: Union[str, None] = '997f5bf53772' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + alter_enum_value( + enum_name="batch_status", + old_value="complete", + new_value="ready to label" + ) + + + +def downgrade() -> None: + alter_enum_value( + enum_name="batch_status", + old_value="ready to label", + new_value="complete" + ) diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index d108a3fa..e25ba5d4 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -22,7 +22,7 @@ def pending_urls_without_html_data() -> Select: join(Task, LinkTaskURL.task_id == Task.id). where(LinkTaskURL.url_id == URL.id). where(Task.task_type == TaskType.HTML.value). - where(Task.task_status == BatchStatus.COMPLETE.value) + where(Task.task_status == BatchStatus.READY_TO_LABEL.value) ) query = ( select(URL). diff --git a/collector_db/models.py b/collector_db/models.py index 4ac117d6..42b113c6 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -17,7 +17,7 @@ CURRENT_TIME_SERVER_DEFAULT = func.now() -batch_status_enum = PGEnum('complete', 'error', 'in-process', 'aborted', name='batch_status') +batch_status_enum = PGEnum('ready to label', 'error', 'in-process', 'aborted', name='batch_status') record_type_values = get_enum_values(RecordType) diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py index fe260266..099f5338 100644 --- a/collector_manager/AsyncCollectorBase.py +++ b/collector_manager/AsyncCollectorBase.py @@ -131,4 +131,4 @@ async def log( )) async def close(self) -> None: - self.status = BatchStatus.COMPLETE + self.status = BatchStatus.READY_TO_LABEL diff --git a/core/TaskManager.py b/core/TaskManager.py index 8a40b129..429375c2 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -159,7 +159,7 @@ async def handle_outcome(self, run_info: TaskOperatorRunInfo): case TaskOperatorOutcome.SUCCESS: await self.adb_client.update_task_status( task_id=run_info.task_id, - status=BatchStatus.COMPLETE + status=BatchStatus.READY_TO_LABEL ) async def handle_task_error(self, run_info: TaskOperatorRunInfo): diff --git a/core/enums.py b/core/enums.py index cfccbb92..714b1d03 100644 --- a/core/enums.py +++ b/core/enums.py @@ -2,7 +2,7 @@ class BatchStatus(Enum): - COMPLETE = "complete" + READY_TO_LABEL = "ready to label" IN_PROCESS = "in-process" ERROR = "error" ABORTED = "aborted" diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index f2b2c098..9e5c0e49 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -32,7 +32,7 @@ def test_auto_googler_collector_lifecycle(test_core): batch_info: BatchInfo = api.dependencies.db_client.get_batch_by_id(1) assert batch_info.strategy == "auto_googler" - assert batch_info.status == BatchStatus.COMPLETE + assert 
batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count == 20 url_infos = db_client.get_urls_by_batch(1) diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 575dedfa..4e87bbbd 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -24,7 +24,7 @@ def test_ckan_lifecycle(test_core): batch_info: BatchInfo = db_client.get_batch_by_id(1) assert batch_info.strategy == "ckan" - assert batch_info.status == BatchStatus.COMPLETE + assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count >= 3000 url_infos = db_client.get_urls_by_batch(1) diff --git a/tests/manual/core/lifecycle/test_common_crawler_lifecycle.py b/tests/manual/core/lifecycle/test_common_crawler_lifecycle.py index d2ee4495..03fe5855 100644 --- a/tests/manual/core/lifecycle/test_common_crawler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_common_crawler_lifecycle.py @@ -34,7 +34,7 @@ def test_common_crawler_lifecycle(test_core: SourceCollectorCore): batch_info = db_client.get_batch_by_id(1) assert batch_info.strategy == "common_crawler" - assert batch_info.status == BatchStatus.COMPLETE + assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.parameters == config url_infos = db_client.get_urls_by_batch(1) diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index b688b0a8..72d2d9fc 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -23,7 +23,7 @@ def test_muckrock_simple_search_collector_lifecycle(test_core): batch_info: BatchInfo = db_client.get_batch_by_id(1) assert batch_info.strategy == "muckrock_simple_search" - assert batch_info.status == BatchStatus.COMPLETE + assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count >= 10 url_infos = db_client.get_urls_by_batch(1) @@ -45,7 +45,7 @@ def test_muckrock_county_level_search_collector_lifecycle(test_core): batch_info: BatchInfo = db_client.get_batch_by_id(1) assert batch_info.strategy == "muckrock_county_search" - assert batch_info.status == BatchStatus.COMPLETE + assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count >= 10 url_infos = db_client.get_urls_by_batch(1) @@ -67,7 +67,7 @@ def test_muckrock_full_search_collector_lifecycle(test_core): batch_info: BatchInfo = db_client.get_batch_by_id(1) assert batch_info.strategy == CollectorType.MUCKROCK_ALL_SEARCH.value - assert batch_info.status == BatchStatus.COMPLETE + assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count >= 1 url_infos = db_client.get_urls_by_batch(1) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index d1466c8c..b13f7e31 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -54,7 +54,7 @@ async def test_example_collector(api_test_helper): csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, - status=BatchStatus.COMPLETE + status=BatchStatus.READY_TO_LABEL ) assert len(csr.results) == 1 @@ -62,10 +62,10 @@ async def test_example_collector(api_test_helper): assert bsi.id == batch_id assert bsi.strategy == 
CollectorType.EXAMPLE.value - assert bsi.status == BatchStatus.COMPLETE + assert bsi.status == BatchStatus.READY_TO_LABEL bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) - assert bi.status == BatchStatus.COMPLETE + assert bi.status == BatchStatus.READY_TO_LABEL assert bi.total_url_count == 2 assert bi.parameters == dto.model_dump() assert bi.strategy == CollectorType.EXAMPLE.value diff --git a/tests/test_automated/integration/core/test_async_core.py b/tests/test_automated/integration/core/test_async_core.py index ed314dfd..f2125865 100644 --- a/tests/test_automated/integration/core/test_async_core.py +++ b/tests/test_automated/integration/core/test_async_core.py @@ -44,7 +44,7 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): task_info = await ddc.adb_client.get_task_info(task_id=task_id) - assert task_info.task_status == BatchStatus.COMPLETE + assert task_info.task_status == BatchStatus.READY_TO_LABEL assert len(task_info.urls) == 3 @pytest.mark.asyncio @@ -65,7 +65,7 @@ async def test_conclude_task_success(db_data_creator: DBDataCreator): task_info = await ddc.adb_client.get_task_info(task_id=task_id) - assert task_info.task_status == BatchStatus.COMPLETE + assert task_info.task_status == BatchStatus.READY_TO_LABEL assert len(task_info.urls) == 3 @pytest.mark.asyncio diff --git a/tests/test_automated/integration/core/test_example_collector_lifecycle.py b/tests/test_automated/integration/core/test_example_collector_lifecycle.py index d3f3f855..a9c4900f 100644 --- a/tests/test_automated/integration/core/test_example_collector_lifecycle.py +++ b/tests/test_automated/integration/core/test_example_collector_lifecycle.py @@ -41,11 +41,11 @@ async def test_example_collector_lifecycle( await asyncio.sleep(1.5) await acore.collector_manager.logger.flush_all() print("Done sleeping...") - assert core.get_status(batch_id) == BatchStatus.COMPLETE + assert core.get_status(batch_id) == BatchStatus.READY_TO_LABEL batch_info: BatchInfo = db_client.get_batch_by_id(batch_id) assert batch_info.strategy == "example" - assert batch_info.status == BatchStatus.COMPLETE + assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count == 2 assert batch_info.parameters == dto.model_dump() assert batch_info.compute_time > 1 @@ -90,4 +90,4 @@ async def test_example_collector_lifecycle_multiple_batches( await asyncio.sleep(3) for csi in csis: - assert core.get_status(csi.batch_id) == BatchStatus.COMPLETE + assert core.get_status(csi.batch_id) == BatchStatus.READY_TO_LABEL diff --git a/util/alembic_helpers.py b/util/alembic_helpers.py index d2120634..84cdbfa7 100644 --- a/util/alembic_helpers.py +++ b/util/alembic_helpers.py @@ -6,7 +6,8 @@ def switch_enum_type( column_name, enum_name, new_enum_values, - drop_old_enum=True + drop_old_enum=True, + cast_dict: dict = None ): """ Switches an ENUM type in a PostgreSQL column by: @@ -36,3 +37,13 @@ def switch_enum_type( # Drop the old enum type if drop_old_enum: op.execute(f'DROP TYPE "{old_enum_temp_name}"') + +def alter_enum_value( + enum_name, + old_value, + new_value +): + """ + Changes one value of an enum type + """ + op.execute(f"ALTER TYPE {enum_name} RENAME VALUE '{old_value}' TO '{new_value}'") \ No newline at end of file From c18bd686865312c44c9c97d35eccc1a90722209d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 17 Apr 2025 14:33:33 -0400 Subject: [PATCH 114/182] feat(app): Add `/batch` filter for batches with pending URLs --- api/routes/batch.py | 14 +++- 
collector_db/AsyncDatabaseClient.py | 28 ++++++- core/AsyncCore.py | 4 +- tests/helpers/DBDataCreator.py | 31 ++++++-- .../api/helpers/RequestValidator.py | 10 ++- .../integration/api/test_batch.py | 77 +++++++++++++++++++ .../collector_db/test_db_client.py | 4 - 7 files changed, 148 insertions(+), 20 deletions(-) diff --git a/api/routes/batch.py b/api/routes/batch.py index 9d4b62cc..2c791503 100644 --- a/api/routes/batch.py +++ b/api/routes/batch.py @@ -1,11 +1,10 @@ from typing import Optional -from fastapi import Path, APIRouter, HTTPException +from fastapi import Path, APIRouter from fastapi.params import Query, Depends from api.dependencies import get_core, get_async_core from collector_db.DTOs.BatchInfo import BatchInfo -from collector_manager.CollectorManager import InvalidCollectorError from collector_manager.enums import CollectorType from core.AsyncCore import AsyncCore from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse @@ -34,6 +33,10 @@ async def get_batch_status( description="Filter by status", default=None ), + has_pending_urls: Optional[bool] = Query( + description="Filter by whether the batch has pending URLs", + default=None + ), page: int = Query( description="The page number", default=1 @@ -44,7 +47,12 @@ async def get_batch_status( """ Get the status of recent batches """ - return await core.get_batch_statuses(collector_type=collector_type, status=status, page=page) + return await core.get_batch_statuses( + collector_type=collector_type, + status=status, + has_pending_urls=has_pending_urls, + page=page + ) @batch_router.get("/{batch_id}") diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 8ceda774..957a4eb6 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1567,17 +1567,37 @@ async def get_recent_batch_status_info( page: int, collector_type: Optional[CollectorType] = None, status: Optional[BatchStatus] = None, + has_pending_urls: Optional[bool] = None ) -> List[BatchInfo]: # Get only the batch_id, collector_type, status, and created_at limit = 100 - query = (Select(Batch) - .order_by(Batch.date_generated.desc())) + query = Select(Batch) + if has_pending_urls is not None: + if has_pending_urls: + # Query for all that have pending URLs + query = query.join(URL, Batch.id == URL.batch_id).filter(URL.outcome == URLStatus.PENDING.value) + else: + # Query for all that DO NOT have pending URLs + # (or that have no URLs at all) + query = query.join( + URL, + Batch.id == URL.batch_id, + isouter=True + ).filter( + or_( + URL.outcome != URLStatus.PENDING.value, + URL.outcome.is_(None) + ) + ) if collector_type: query = query.filter(Batch.strategy == collector_type.value) if status: query = query.filter(Batch.status == status.value) - query = (query.limit(limit) - .offset((page - 1) * limit)) + + query = (query. + order_by(Batch.date_generated.desc()). + limit(limit). 
+ offset((page - 1) * limit)) raw_results = await session.execute(query) batches = raw_results.scalars().all() return [BatchInfo(**batch.__dict__) for batch in batches] diff --git a/core/AsyncCore.py b/core/AsyncCore.py index cb9a80bc..d436d3c9 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -67,12 +67,14 @@ async def get_batch_statuses( self, collector_type: Optional[CollectorType], status: Optional[BatchStatus], + has_pending_urls: Optional[bool], page: int ) -> GetBatchStatusResponse: results = await self.adb_client.get_recent_batch_status_info( collector_type=collector_type, status=status, - page=page + page=page, + has_pending_urls=has_pending_urls ) return GetBatchStatusResponse(results=results) diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 613bfe4d..28d8a573 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -36,11 +36,15 @@ def __init__(self, db_client: Optional[DatabaseClient] = None): self.db_client = DatabaseClient() self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() - def batch(self, strategy: CollectorType = CollectorType.EXAMPLE) -> int: + def batch( + self, + strategy: CollectorType = CollectorType.EXAMPLE, + batch_status: BatchStatus = BatchStatus.IN_PROCESS + ) -> int: return self.db_client.insert_batch( BatchInfo( strategy=strategy.value, - status=BatchStatus.IN_PROCESS, + status=batch_status, total_url_count=1, parameters={"test_key": "test_value"}, user_id=1 @@ -56,11 +60,26 @@ async def task(self, url_ids: Optional[list[int]] = None) -> int: async def batch_and_urls( self, strategy: CollectorType = CollectorType.EXAMPLE, - url_count: int = 1, - with_html_content: bool = False + url_count: int = 3, + with_html_content: bool = False, + batch_status: BatchStatus = BatchStatus.READY_TO_LABEL, + url_status: URLStatus = URLStatus.PENDING ) -> BatchURLCreationInfo: - batch_id = self.batch(strategy=strategy) - iuis: InsertURLsInfo = self.urls(batch_id=batch_id, url_count=url_count) + batch_id = self.batch( + strategy=strategy, + batch_status=batch_status + ) + if batch_status in (BatchStatus.ERROR, BatchStatus.ABORTED): + return BatchURLCreationInfo( + batch_id=batch_id, + url_ids=[], + urls=[] + ) + iuis: InsertURLsInfo = self.urls( + batch_id=batch_id, + url_count=url_count, + outcome=url_status + ) url_ids = [iui.url_id for iui in iuis.url_mappings] if with_html_content: await self.html_data(url_ids) diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index f8ada6ae..4a12bb0e 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -120,13 +120,19 @@ def delete( expected_response=expected_response, **kwargs) - def get_batch_statuses(self, collector_type: Optional[CollectorType] = None, status: Optional[BatchStatus] = None) -> GetBatchStatusResponse: + def get_batch_statuses( + self, + collector_type: Optional[CollectorType] = None, + status: Optional[BatchStatus] = None, + has_pending_urls: Optional[bool] = None + ) -> GetBatchStatusResponse: params = {} update_if_not_none( target=params, source={ "collector_type": collector_type.value if collector_type else None, - "status": status.value if status else None + "status": status.value if status else None, + "has_pending_urls": has_pending_urls } ) data = self.get( diff --git a/tests/test_automated/integration/api/test_batch.py 
b/tests/test_automated/integration/api/test_batch.py index 604e2d67..bc86dfec 100644 --- a/tests/test_automated/integration/api/test_batch.py +++ b/tests/test_automated/integration/api/test_batch.py @@ -1,11 +1,88 @@ import asyncio import time +import pytest + from collector_db.DTOs.BatchInfo import BatchInfo from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO +from collector_manager.enums import CollectorType, URLStatus from core.enums import BatchStatus +@pytest.mark.asyncio +async def test_get_batch_status_pending_url_filter(api_test_helper): + ath = api_test_helper + + # Add an errored out batch + batch_error = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=1, + batch_status=BatchStatus.ERROR + ) + + # Add a batch with pending urls + batch_pending = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=1, + batch_status=BatchStatus.READY_TO_LABEL, + with_html_content=True, + url_status=URLStatus.PENDING + ) + + # Add a batch with submitted URLs + batch_submitted = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=1, + batch_status=BatchStatus.READY_TO_LABEL, + with_html_content=True, + url_status=URLStatus.SUBMITTED + ) + + # Add an aborted batch + batch_aborted = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=1, + batch_status=BatchStatus.ABORTED + ) + + # Add a batch with validated URLs + batch_validated = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=1, + batch_status=BatchStatus.READY_TO_LABEL, + with_html_content=True, + url_status=URLStatus.VALIDATED + ) + + # Test filter for pending URLs and only retrieve the second batch + pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=True + ) + + assert len(pending_urls_results.results) == 1 + assert pending_urls_results.results[0].id == batch_pending.batch_id + + # Test filter without pending URLs and retrieve the other four batches + no_pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=False + ) + + assert len(no_pending_urls_results.results) == 4 + for result in no_pending_urls_results.results: + assert result.id in [ + batch_error.batch_id, + batch_submitted.batch_id, + batch_validated.batch_id, + batch_aborted.batch_id + ] + + # Test no filter for pending URLs and retrieve all batches + no_filter_results = ath.request_validator.get_batch_statuses() + + assert len(no_filter_results.results) == 5 + + + def test_abort_batch(api_test_helper): ath = api_test_helper diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 5ea0bee2..5560577e 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -269,10 +269,6 @@ async def test_get_next_url_for_final_review_favor_more_components(db_data_creat assert result.id == url_mapping_with_user_anno.url_id - - - - @pytest.mark.asyncio async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBDataCreator): """ From 85a2883aef74df2fdccf87b4bc672d4465508e45 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 17 Apr 2025 15:20:33 -0400 Subject: [PATCH 115/182] fix(database): fix duplicate bug in `/batch` get for `has_pending_urls` Previously, `has_pending_urls` was 
returning multiple instances of the same batch: the JOIN used to implement the filter yielded one row per matching URL, so any batch with more than one pending URL appeared repeatedly in the results. The filter now uses an `EXISTS` subquery against the URL table, which returns each batch at most once.
---
 collector_db/AsyncDatabaseClient.py | 24 ++++++++++++-------
 tests/helpers/DBDataCreator.py | 2 +-
 .../integration/api/test_batch.py | 10 ++++----
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index 957a4eb6..9e1ab473 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -1573,20 +1573,26 @@ async def get_recent_batch_status_info(
         limit = 100
         query = Select(Batch)
         if has_pending_urls is not None:
+            pending_url_subquery = Select(URL).where(
+                and_(
+                    URL.batch_id == Batch.id,
+                    URL.outcome == URLStatus.PENDING.value
+                )
+            )
+
             if has_pending_urls:
                 # Query for all that have pending URLs
-                query = query.join(URL, Batch.id == URL.batch_id).filter(URL.outcome == URLStatus.PENDING.value)
+                query = query.where(exists(
+                    pending_url_subquery
+                ))
             else:
                 # Query for all that DO NOT have pending URLs
                 # (or that have no URLs at all)
-                query = query.join(
-                    URL,
-                    Batch.id == URL.batch_id,
-                    isouter=True
-                ).filter(
-                    or_(
-                        URL.outcome != URLStatus.PENDING.value,
-                        URL.outcome.is_(None)
+                query = query.where(
+                    not_(
+                        exists(
+                            pending_url_subquery
+                        )
                     )
                 )
         if collector_type:
diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py
index 28d8a573..695a3c7a 100644
--- a/tests/helpers/DBDataCreator.py
+++ b/tests/helpers/DBDataCreator.py
@@ -2,7 +2,7 @@
 from random import randint
 from typing import List, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.DTOs.BatchInfo import BatchInfo
diff --git a/tests/test_automated/integration/api/test_batch.py b/tests/test_automated/integration/api/test_batch.py
index bc86dfec..961b1a30 100644
--- a/tests/test_automated/integration/api/test_batch.py
+++ b/tests/test_automated/integration/api/test_batch.py
@@ -16,14 +16,14 @@ async def test_get_batch_status_pending_url_filter(api_test_helper):
     # Add an errored out batch
     batch_error = await ath.db_data_creator.batch_and_urls(
         strategy=CollectorType.EXAMPLE,
-        url_count=1,
+        url_count=2,
         batch_status=BatchStatus.ERROR
     )
 
     # Add a batch with pending urls
     batch_pending = await ath.db_data_creator.batch_and_urls(
         strategy=CollectorType.EXAMPLE,
-        url_count=1,
+        url_count=2,
         batch_status=BatchStatus.READY_TO_LABEL,
         with_html_content=True,
         url_status=URLStatus.PENDING
@@ -32,7 +32,7 @@ async def test_get_batch_status_pending_url_filter(api_test_helper):
     # Add a batch with submitted URLs
     batch_submitted = await ath.db_data_creator.batch_and_urls(
         strategy=CollectorType.EXAMPLE,
-        url_count=1,
+        url_count=2,
         batch_status=BatchStatus.READY_TO_LABEL,
         with_html_content=True,
         url_status=URLStatus.SUBMITTED
@@ -41,14 +41,14 @@ async def test_get_batch_status_pending_url_filter(api_test_helper):
     # Add an aborted batch
     batch_aborted = await ath.db_data_creator.batch_and_urls(
         strategy=CollectorType.EXAMPLE,
-        url_count=1,
+        url_count=2,
         batch_status=BatchStatus.ABORTED
     )
 
     # Add a batch with validated URLs
     batch_validated = await ath.db_data_creator.batch_and_urls(
         strategy=CollectorType.EXAMPLE,
-        url_count=1,
+        url_count=2,
        batch_status=BatchStatus.READY_TO_LABEL,
         with_html_content=True,
         url_status=URLStatus.VALIDATED

From cfad874f3fb95a6dd9633a748f99227e59f0a1fc Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Thu, 17 Apr 2025 15:43:21 -0400
Subject: [PATCH 116/182] fix(app):
Change suggestion type `MANUAL_SUGGESTION` to `USER_SUGGESTION` --- collector_db/DTOConverter.py | 2 +- core/enums.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py index 2b6cf521..b43fbbe9 100644 --- a/collector_db/DTOConverter.py +++ b/collector_db/DTOConverter.py @@ -100,7 +100,7 @@ def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( if suggestion is None: return None return GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.MANUAL_SUGGESTION, + suggestion_type=SuggestionType.USER_SUGGESTION, pdap_agency_id=suggestion.agency_id, agency_name=suggestion.agency.name, state=suggestion.agency.state, diff --git a/core/enums.py b/core/enums.py index 714b1d03..173c66e9 100644 --- a/core/enums.py +++ b/core/enums.py @@ -54,7 +54,7 @@ class SuggestionType(Enum): Identifies the specific kind of suggestion made for a URL """ AUTO_SUGGESTION = "Auto Suggestion" - MANUAL_SUGGESTION = "Manual Suggestion" + USER_SUGGESTION = "User Suggestion" UNKNOWN = "Unknown" NEW_AGENCY = "New Agency" CONFIRMED = "Confirmed" From 3519bf42f8948a7e15d52d2d08fac5dc59f9a703 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 17 Apr 2025 16:36:40 -0400 Subject: [PATCH 117/182] refactor(app): remove deprecated and unused code --- .github/workflows/common_crawler.yaml | 40 -- .github/workflows/populate_labelstudio.yml | 94 ----- ENV.md | 3 - agency_identifier/README.md | 40 -- agency_identifier/__init__.py | 0 agency_identifier/identifier.py | 234 ----------- api/routes/batch.py | 1 - collector_db/enums.py | 1 - collector_manager/AsyncCollectorBase.py | 11 +- core/SourceCollectorCore.py | 2 - core/TaskManager.py | 2 +- .../MuckrockAgencyIdentificationSubtask.py | 2 +- .../AgencyIdentificationTaskOperator.py | 2 +- source_collectors/ckan/README.md | 22 -- source_collectors/ckan/main.py | 44 --- source_collectors/ckan/requirements.txt | 6 - source_collectors/ckan/schemas.py | 6 - .../ckan/scrape_ckan_data_portals.py | 25 -- source_collectors/common_crawler/README.md | 87 ----- source_collectors/common_crawler/argparser.py | 95 ----- source_collectors/common_crawler/cache.py | 93 ----- source_collectors/common_crawler/config.ini | 19 - .../common_crawler/csv_manager.py | 79 ---- .../common_crawler/data/cache.json | 7 - .../common_crawler/data/urls.csv | 207 ---------- source_collectors/common_crawler/main.py | 366 ------------------ .../requirements_common_crawler_action.txt | 3 - source_collectors/common_crawler/schemas.py | 22 -- .../muckrock}/MuckrockAPIInterface.py | 0 source_collectors/muckrock/README.md | 82 ---- .../muckrock/classes/SQLiteClient.py | 38 -- source_collectors/muckrock/muck_get.py | 16 - source_collectors/muckrock/requirements.txt | 30 -- .../test_muckrock_api_interface.py | 2 +- tests/manual/unsorted/test_identifier_unit.py | 275 ------------- .../integration/api/conftest.py | 2 - .../tasks/test_agency_preannotation_task.py | 2 +- 37 files changed, 10 insertions(+), 1950 deletions(-) delete mode 100644 .github/workflows/common_crawler.yaml delete mode 100644 .github/workflows/populate_labelstudio.yml delete mode 100644 agency_identifier/README.md delete mode 100644 agency_identifier/__init__.py delete mode 100644 agency_identifier/identifier.py delete mode 100644 source_collectors/ckan/main.py delete mode 100644 source_collectors/ckan/requirements.txt delete mode 100644 source_collectors/ckan/schemas.py delete mode 100644 source_collectors/common_crawler/README.md delete mode 100644 
source_collectors/common_crawler/argparser.py delete mode 100644 source_collectors/common_crawler/cache.py delete mode 100644 source_collectors/common_crawler/config.ini delete mode 100644 source_collectors/common_crawler/csv_manager.py delete mode 100644 source_collectors/common_crawler/data/cache.json delete mode 100644 source_collectors/common_crawler/data/urls.csv delete mode 100644 source_collectors/common_crawler/main.py delete mode 100644 source_collectors/common_crawler/requirements_common_crawler_action.txt delete mode 100644 source_collectors/common_crawler/schemas.py rename {agency_identifier => source_collectors/muckrock}/MuckrockAPIInterface.py (100%) delete mode 100644 source_collectors/muckrock/classes/SQLiteClient.py delete mode 100644 source_collectors/muckrock/muck_get.py delete mode 100644 source_collectors/muckrock/requirements.txt delete mode 100644 tests/manual/unsorted/test_identifier_unit.py diff --git a/.github/workflows/common_crawler.yaml b/.github/workflows/common_crawler.yaml deleted file mode 100644 index 52b4007d..00000000 --- a/.github/workflows/common_crawler.yaml +++ /dev/null @@ -1,40 +0,0 @@ -name: Common Crawler - -# Pull request will run every day at 1AM. -on: - workflow_dispatch: -env: - # The access token enabling write access to the Huggingface Database - HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} - -jobs: - build-and-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - # This is necessary to push commits back to the repository - persist-credentials: true - fetch-depth: 0 # Fetch all history for all tags and branches - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.11.8 - - name: Upgrade pip - run: python -m pip install --upgrade pip - - name: Install dependencies - run: pip install -r source_collectors/common_crawler/requirements_common_crawler_action.txt - - name: Run script - run: python source_collectors/common_crawler/main.py CC-MAIN-2024-10 *.gov police --config source_collectors/common_crawler/config.ini --pages 20 - - name: Configure Git - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - - name: Add common_crawler cache and common_crawler batch_info - run: | - git add source_collectors/common_crawler/data/cache.json - git add source_collectors/common_crawler/data/batch_info.csv - - name: Commit changes - run: git commit -m "Update common_crawler cache and batch_info" - - name: Push changes - run: git push \ No newline at end of file diff --git a/.github/workflows/populate_labelstudio.yml b/.github/workflows/populate_labelstudio.yml deleted file mode 100644 index 09ca68b2..00000000 --- a/.github/workflows/populate_labelstudio.yml +++ /dev/null @@ -1,94 +0,0 @@ -name: Populate LabelStudio - -on: - workflow_dispatch: - inputs: - crawl_id: - description: 'Common Crawl Corpus' - required: true - default: 'CC-MAIN-2024-10' - url: - description: 'URL type' - required: true - default: '*.gov' - keyword: - description: 'keyword' - required: true - default: 'police' - pages: - description: 'num pages' - required: true - default: '2' - record_type: - description: 'record type' - required: false - - -jobs: - run-script: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - ref: main - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip 
install -r annotation_pipeline/requirements.txt - - - name: Run main script - env: - HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} - LABEL_STUDIO_ACCESS_TOKEN: ${{ secrets.LABEL_STUDIO_ACCESS_TOKEN }} - LABEL_STUDIO_PROJECT_ID: ${{ secrets.LABEL_STUDIO_PROJECT_ID }} - LABEL_STUDIO_ORGANIZATION: ${{ secrets.LABEL_STUDIO_ORGANIZATION }} - run: | - if [ -n "${{ github.event.inputs.record_type }}" ]; then - python annotation_pipeline/populate_labelstudio.py ${{ github.event.inputs.crawl_id }} "${{ github.event.inputs.url }}" ${{ github.event.inputs.keyword }} --pages ${{ github.event.inputs.pages }} --record_type "${{ github.event.inputs.record_type }}" - else - python annotation_pipeline/populate_labelstudio.py ${{ github.event.inputs.crawl_id }} "${{ github.event.inputs.url }}" ${{ github.event.inputs.keyword }} --pages ${{ github.event.inputs.pages }} - fi - - - name: Check created/modified files - run: | - echo "Checking files in annotation_pipeline/data/" - ls -R annotation_pipeline/data/ - - - name: Create new branch - run: | - BRANCH_NAME=bot-update-$(date +%Y%m%d%H%M%S) - git checkout -b $BRANCH_NAME - echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV - - - name: Commit and push outputs - run: | - git config --global user.name "github-actions[bot]" - git config --global user.email "action@github.com" - git add annotation_pipeline/data/batch_info.csv - git add annotation_pipeline/data/cache.json - if [ -d "annotation_pipeline/data/tag_collector" ]; then - git add annotation_pipeline/data/tag_collector/* - fi - git commit -m "Update batch info, cache, and collected urls & tags" - git log -1 --stat - git push --set-upstream origin $BRANCH_NAME - - - name: Create pull request - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH_NAME: ${{ env.BRANCH_NAME }} - run: | - PR_TITLE="Update batch info, cache, and collected urls & tags" - PR_BODY="This PR was created automatically by a GitHub Action." - echo "Creating PR from branch $BRANCH_NAME to main" - curl -X POST -H "Authorization: token $GITHUB_TOKEN" \ - -d "{\"title\":\"$PR_TITLE\",\"body\":\"$PR_BODY\",\"head\":\"$BRANCH_NAME\",\"base\":\"main\"}" \ - https://api.github.com/repos/${{ github.repository }}/pulls diff --git a/ENV.md b/ENV.md index 7c09fb64..5292320b 100644 --- a/ENV.md +++ b/ENV.md @@ -4,9 +4,6 @@ Please ensure these are properly defined in a `.env` file in the root directory. | Name | Description | Example | |----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------| -| `LABEL_STUDIO_ACCESS_TOKEN` | The access token for the Label Studio API. The access token for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. | `abc123` | -| `LABEL_STUDIO_PROJECT_ID` | The project ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL, as in `https://app.heartex.com/projects/58475/` | `58475` | -| `LABEL_STUDIO_ORGANIZATION_ID` | The organization ID for the Label Studio API. 
This can be obtained by logging into Label Studio and navigating to the [Organization section](https://app.heartex.com/organization?page=1), where the organization ID can be copied. | `6758` | | `GOOGLE_API_KEY` | The API key required for accessing the Google Custom Search API | `abc123` | | `GOOGLE_CSE_ID` | The CSE ID required for accessing the Google Custom Search API | `abc123` | |`POSTGRES_USER` | The username for the test database | `test_source_collector_user` | diff --git a/agency_identifier/README.md b/agency_identifier/README.md deleted file mode 100644 index c1dadcf2..00000000 --- a/agency_identifier/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Agency Identifier - -The Agency Identifier is a Python application that matches URLs with an agency from the PDAP database. It takes a list of URLs as input, either from a CSV file or a DataFrame, and returns a DataFrame with the matched agencies. - -## How to use - -### Running from the command line - -1. Clone the repository. -2. Create a CSV file containing a list of URLs to be identified. The URLs should be listed one per line, and the file should have at least a "url" column. -3. Run the command `python3 identifier.py [url_file]`, replacing `[url_file]` with the path to your CSV file. -4. The results will be written to a file named `results.csv` in the same directory. - -### Using the "identifier_main" function - -If you're using the Agency Identifier in your own Python code, you can import the `process_and_write_data` function. This function takes a DataFrame as an argument and returns a DataFrame with the matched agencies. - -Here's an example of how to use it: - -```python -import polar as pl -from identifier import process_and_write_data - -# Create a DataFrame with the URLs to be identified -df = pl.DataFrame({"url": ["http://agency1.com/page1", "http://agency2.com/page2"]}) - -# Call the identifier_main function -result = process_and_write_data(df) - -# Print the resulting DataFrame -print(result) -``` - -# Requirements - -- Python 3 -- urllib -- re -- polars -- requests \ No newline at end of file diff --git a/agency_identifier/__init__.py b/agency_identifier/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/agency_identifier/identifier.py b/agency_identifier/identifier.py deleted file mode 100644 index 786aeba6..00000000 --- a/agency_identifier/identifier.py +++ /dev/null @@ -1,234 +0,0 @@ -import os -import re -import sys -from urllib.parse import urlparse - -import polars -import requests - -API_URL = "https://data-sources.pdap.io/api/agencies/" - - -def get_page_data(page: int) -> dict: - """Fetches a page of data from the API. - - Args: - page (int): The page number to fetch. - - Returns: - dict: The data for the page. - """ - api_key = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY") - response = requests.get(f"{API_URL}{page}", headers={"Authorization": api_key}) - if response.status_code != 200: - raise Exception("Request to PDAP API failed. Response code:", response.status_code) - return response.json()["data"] - - -def get_agencies_data() -> polars.DataFrame: - """Retrives a list of agency dictionaries from file. - - Returns: - list: List of agency dictionaries. 
- """ - page = 1 - agencies_df = polars.DataFrame() - results = get_page_data(page) - - while results: - # Use list comprehension to clean results - clean_results = clean_page_data_results(results) - new_agencies_df = polars.DataFrame(clean_results) - if not new_agencies_df.is_empty(): - agencies_df = polars.concat([agencies_df, new_agencies_df]) - page += 1 - results = get_page_data(page) - - return agencies_df - - -def clean_page_data_results(results: list[dict[str, str]]) -> list[dict[str, str]]: - clean_results = [] - for result in results: - clean_result = {} - for k, v in result.items(): - if v is None: - clean_result[k] = "" - else: - clean_result[k] = v - clean_results.append(clean_result) - return clean_results - - -def parse_hostname(url: str) -> str: - """Retrieves the hostname (example.com) from a url string. - - Args: - url (str): Url to parse. - - Returns: - str: The url's hostname. - """ - try: - # Remove leading and trailing whitespaces and quotes - url = url.strip().strip('"') - - # Add "http://" to the url if it's not present - if not re.match(r'http(s)?://', url): - url = "http://" + url - - # Parse the url and retrieve the hostname - parsed_url = urlparse(url) - hostname = parsed_url.hostname - - # Remove "www." from the hostname - hostname = re.sub(r'^www\.', '', hostname) - except Exception as e: - print(f"An error occurred while parsing the URL: {e}") - raise e - return hostname - - -def remove_http(url: str) -> str: - """Removes http(s)://www. from a given url so that different protocols don't throw off the matcher. - - Args: - url (str): Url to remove http from. - - Returns: - str: The url without http(s)://www. - """ - try: - # Remove http(s)://www. and www. prefixes from the url - url = re.sub(r'^(http(s)?://)?(www\.)?', '', url) - # Ensure the url ends with a / - if not url.endswith('/'): - url += '/' - except Exception as e: - print(f"An error occurred while processing the URL: {e}") - raise e - return url - - -def match_agencies(agencies, agency_hostnames, url): - """Attempts to match a url with an agency. - - Args: - agencies (list): List of agency dictionaries. - agency_hostnames (list): List of corresponding agency hostnames. - url (str): Url to match. - - Returns: - dict: Dictionary of a match in the form {"url": url, "agency": matched_agency}. 
- """ - url = url.strip().strip('"') - url_hostname = parse_hostname(url) - - if url_hostname in agency_hostnames: - # All agencies with the same hostname as the url are found - matched_agency = [ - agencies[i] for i, agency_hostname in enumerate(agency_hostnames) if url_hostname == agency_hostname - ] - else: - return {"url": url, "agency": [], "status": "No match found"} - - # More than one agency was found - if len(matched_agency) > 1: - url_no_http = remove_http(url) - - for agency in matched_agency: - agency_homepage = remove_http(agency["homepage_url"]) - # It is assumed that if the url begins with the agency's url, then it belongs to that agency - if url_no_http.startswith(agency_homepage): - return {"url": url, "agency": agency, "status": "Match found"} - break - - return {"url": url, "agency": [], "status": "Contested match"} - - return {"url": url, "agency": matched_agency[0], "status": "Match found"} - - -def match_urls_to_agencies_and_clean_data(urls_df: polars.DataFrame) -> polars.DataFrame: - agencies_df = get_agencies_data() - # Filter out agencies without a homepage_url set - # Define column names as variables for flexibility - homepage_url_col = "homepage_url" - hostname_col = "hostname" - count_data_sources_col = "count_data_sources" - max_data_sources_col = "max_data_sources" - - # Perform operations on DataFrame - try: - agencies_df = ( - agencies_df - # Filter out rows without a homepage_url - .filter(polars.col(homepage_url_col).is_not_null()) - .filter(polars.col(homepage_url_col) != "") - # Add a new column 'hostname' by applying the parse_hostname function to 'homepage_url' - .with_columns(polars.col(homepage_url_col).map_elements(parse_hostname).alias(hostname_col), - polars.col(count_data_sources_col).fill_null(0)) - # Add a new column 'max_data_sources' which is the max of 'count_data_sources' over 'hostname' - .with_columns(polars.col(count_data_sources_col).max().over(hostname_col).alias(max_data_sources_col)) - # Filter rows where 'count_data_sources' equals 'max_data_sources' - .filter(polars.col(count_data_sources_col) == polars.col(max_data_sources_col)) - # Keep only unique rows based on 'homepage_url' - .unique(subset=[homepage_url_col]) - ) - print("Indentifying agencies...") - # Add a new column 'hostname' by applying the parse_hostname function to 'url' - urls_df = urls_df.with_columns(polars.col("url").map_elements(parse_hostname).alias("hostname")) - - # Join urls_df with agencies_df on 'hostname' - matched_agencies_df = urls_df.join(agencies_df, on="hostname", how="left") - - # Replace all null values with an empty string - matched_agencies_clean_df = matched_agencies_df.with_columns(polars.all().fill_null("")) - except Exception as e: - print(f"An error occurred while processing the data: {e}") - raise e - return matched_agencies_clean_df - - -def read_data(file_path: str) -> polars.DataFrame: - try: - return polars.read_csv(file_path) - except Exception as e: - print(f"An error occurred while reading the file: {e}") - raise e - - -def write_data(df: polars.DataFrame, file_path: str): - try: - df.write_csv(file_path) - print("Results written to results.csv") - except Exception as e: - print(f"An error occurred while writing to the file: {e}") - raise e - - -def process_data(urls_df: polars.DataFrame) -> polars.DataFrame: - matched_agencies_df = match_urls_to_agencies_and_clean_data(urls_df) - - # Filter out rows where the hostname is not null - matches_only = matched_agencies_df.filter(polars.col("hostname").is_not_null()) - num_matches = 
len(matches_only) - num_urls = len(urls_df) - percent_urls_matched = 100 * float(num_matches) / float(num_urls) - - # Print the number and percentage of URLs that were matched - print(f"\n{num_matches} / {num_urls} ({percent_urls_matched:0.1f}%) of urls identified") - - # Return the DataFrame containing only the matched URLs - return matches_only - - -def process_and_write_data(input_file: str, output_file: str): - urls_df = read_data(input_file) - matches_only = process_data(urls_df) - if not matches_only.is_empty(): - write_data(matches_only, output_file) - - -if __name__ == "__main__": - process_and_write_data(sys.argv[1], "results.csv") - print("Results written to results.csv") diff --git a/api/routes/batch.py b/api/routes/batch.py index 2c791503..7ba0a2a4 100644 --- a/api/routes/batch.py +++ b/api/routes/batch.py @@ -103,7 +103,6 @@ async def get_batch_logs( @batch_router.post("/{batch_id}/abort") async def abort_batch( batch_id: int = Path(description="The batch id"), - core: SourceCollectorCore = Depends(get_core), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> MessageResponse: diff --git a/collector_db/enums.py b/collector_db/enums.py index a701a847..b28b6091 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -17,7 +17,6 @@ class ValidationStatus(PyEnum): class ValidationSource(PyEnum): MACHINE_LEARNING = "Machine Learning" - LABEL_STUDIO = "Label Studio" MANUAL = "Manual" diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py index 099f5338..a842a9c0 100644 --- a/collector_manager/AsyncCollectorBase.py +++ b/collector_manager/AsyncCollectorBase.py @@ -72,17 +72,17 @@ async def handle_error(self, e: Exception) -> None: ) async def process(self) -> None: - await self.log("Processing collector...", allow_abort=False) + await self.log("Processing collector...") preprocessor = self.preprocessor() url_infos = preprocessor.preprocess(self.data) - await self.log(f"URLs processed: {len(url_infos)}", allow_abort=False) + await self.log(f"URLs processed: {len(url_infos)}") - await self.log("Inserting URLs...", allow_abort=False) + await self.log("Inserting URLs...") insert_urls_info: InsertURLsInfo = await self.adb_client.insert_urls( url_infos=url_infos, batch_id=self.batch_id ) - await self.log("Updating batch...", allow_abort=False) + await self.log("Updating batch...") await self.adb_client.update_batch_post_collection( batch_id=self.batch_id, total_url_count=insert_urls_info.total_count, @@ -91,7 +91,7 @@ async def process(self) -> None: batch_status=self.status, compute_time=self.compute_time ) - await self.log("Done processing collector.", allow_abort=False) + await self.log("Done processing collector.") if self.post_collection_function_trigger is not None: await self.post_collection_function_trigger.trigger_or_rerun() @@ -123,7 +123,6 @@ async def run(self) -> None: async def log( self, message: str, - allow_abort = True # Deprecated ) -> None: await self.logger.log(LogInfo( batch_id=self.batch_id, diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index 4516ceb5..6f05a3c4 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -9,8 +9,6 @@ class SourceCollectorCore: def __init__( self, - core_logger: Optional[Any] = None, # Deprecated - collector_manager: Optional[Any] = None, # Deprecated db_client: Optional[DatabaseClient] = None, dev_mode: bool = False ): diff --git a/core/TaskManager.py b/core/TaskManager.py index 
429375c2..e72724fc 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -1,6 +1,6 @@ import logging -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType diff --git a/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py b/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py index 03f2a064..a6222cf8 100644 --- a/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py +++ b/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py @@ -1,6 +1,6 @@ from typing import Optional -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponse, AgencyLookupResponseType +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponse, AgencyLookupResponseType from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.exceptions import MuckrockAPIError from core.helpers import process_match_agency_response_to_suggestions diff --git a/core/classes/task_operators/AgencyIdentificationTaskOperator.py b/core/classes/task_operators/AgencyIdentificationTaskOperator.py index 4c2d6f1b..b6e53955 100644 --- a/core/classes/task_operators/AgencyIdentificationTaskOperator.py +++ b/core/classes/task_operators/AgencyIdentificationTaskOperator.py @@ -1,6 +1,6 @@ from aiohttp import ClientSession -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType diff --git a/source_collectors/ckan/README.md b/source_collectors/ckan/README.md index be6c65cf..2afcbb28 100644 --- a/source_collectors/ckan/README.md +++ b/source_collectors/ckan/README.md @@ -19,28 +19,6 @@ Running the scraper will output a list of packages to a CSV file using the searc * `search_terms.py` - The search terms and CKAN portals to search from. * `ckan_scraper_toolkit.py` - Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals. -## Setup - -1. In a terminal, navigate to the CKAN scraper folder - ```cmd - cd scrapers_library/data_portals/ckan/ - ``` -2. Create and activate a Python virtual environment - ```cmd - python -m venv venv - source venv/bin/activate - ``` - -3. Install the requirements - ```cmd - pip install -r requirements.txt - ``` -4. Run the multi-portal CKAN scraper - ```cmd - python scrape_ckan_data_portals.py - ``` -5. Review the generated `results.csv` file. - ## How can I tell if a website I want to scrape is hosted using CKAN? There's no easy way to tell, some websites will reference CKAN or link back to the CKAN documentation while others will not. There doesn't seem to be a database of all CKAN instances either. 
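
One practical heuristic, sketched below: every CKAN portal serves the Action API under `/api/3/action/`, and responses come wrapped in CKAN's `{"success": ...}` envelope, so probing `package_search` usually settles the question. This is an illustrative sketch, not part of the codebase: the portal URL is an assumption (data.gov's catalog is known to run CKAN), and some portals may disable or proxy the endpoint.

```python
import requests


def looks_like_ckan(portal_url: str) -> bool:
    """Best-effort check for a CKAN portal via its Action API."""
    endpoint = f"{portal_url.rstrip('/')}/api/3/action/package_search"
    try:
        response = requests.get(endpoint, params={"rows": 0}, timeout=10)
        # CKAN wraps every Action API response in a {"success": ...} envelope.
        return response.ok and response.json().get("success") is True
    except (requests.RequestException, ValueError):
        return False


print(looks_like_ckan("https://catalog.data.gov"))  # expected: True
```
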
diff --git a/source_collectors/ckan/main.py b/source_collectors/ckan/main.py deleted file mode 100644 index 091d2642..00000000 --- a/source_collectors/ckan/main.py +++ /dev/null @@ -1,44 +0,0 @@ -from source_collectors.ckan.ckan_scraper_toolkit import ckan_package_search, ckan_group_package_show, \ - ckan_package_search_from_organization -from source_collectors.ckan.scrape_ckan_data_portals import perform_search, get_flat_list, deduplicate_entries, \ - get_collection_child_packages, filter_result, parse_result, write_to_csv -from source_collectors.ckan.search_terms import package_search, group_search, organization_search - - - -async def main(): - """ - Main function. - """ - results = [] - - print("Gathering results...") - results = await perform_search( - search_func=ckan_package_search, - search_terms=package_search, - results=results, - ) - results = await perform_search( - search_func=ckan_group_package_show, - search_terms=group_search, - results=results, - ) - results = await perform_search( - search_func=ckan_package_search_from_organization, - search_terms=organization_search, - results=results, - ) - - flat_list = get_flat_list(results) - # Deduplicate entries - flat_list = deduplicate_entries(flat_list) - print("\nRetrieving collections...") - flat_list = get_collection_child_packages(flat_list) - - filtered_results = list(filter(filter_result, flat_list)) - parsed_results = list(map(parse_result, filtered_results)) - - write_to_csv(parsed_results) - -if __name__ == "__main__": - main() diff --git a/source_collectors/ckan/requirements.txt b/source_collectors/ckan/requirements.txt deleted file mode 100644 index fc41154b..00000000 --- a/source_collectors/ckan/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -from_root -ckanapi -bs4 -lxml -tqdm -pandas \ No newline at end of file diff --git a/source_collectors/ckan/schemas.py b/source_collectors/ckan/schemas.py deleted file mode 100644 index 6aeecf09..00000000 --- a/source_collectors/ckan/schemas.py +++ /dev/null @@ -1,6 +0,0 @@ -from marshmallow import Schema, fields - - -class PackageSearchSchema(Schema): - count = fields.Int(required=True) - results = fields.List(fields.Str(), required=True) # TODO: What is the structure of this? \ No newline at end of file diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index ad3d62e2..3a292b02 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -4,7 +4,6 @@ from itertools import chain from typing import Any, Callable, Optional -import pandas as pd from from_root import from_root from tqdm import tqdm @@ -41,26 +40,6 @@ async def perform_search( return results -async def get_collection_child_packages( - results: list[dict[str, Any]] -) -> list[dict[str, Any]]: - """Retrieves the child packages of each collection. - - :param results: List of results. - :return: List of results containing child packages. 
- """ - new_list = [] - - for result in tqdm(results): - if "extras" in result.keys(): - collections = await get_collections(result) - if collections: - new_list += collections[0] - continue - - new_list.append(result) - - return new_list async def get_collections(result): @@ -265,7 +244,3 @@ def deduplicate_entries(flat_list): return flat_list -def write_to_csv(parsed_results): - df = pd.DataFrame(parsed_results) - df.to_csv("results.csv") - diff --git a/source_collectors/common_crawler/README.md b/source_collectors/common_crawler/README.md deleted file mode 100644 index 3701b5d5..00000000 --- a/source_collectors/common_crawler/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Common Crawler - -This module interfaces with the Common Crawl dataset to extract urls. - -## Installation - -Python Version Required: 3.11 - -To install all necessary dependencies, run the following command from the root directory: - -```bash -pip install -r requirements.txt -``` - - -## Usage Example - -### Environment Requirements - -Please ensure you have a `.env` file located in the root directory (not the `common_crawler` directory) -which contains the following environment variable: - -* HUGGINGFACE_ACCESS_TOKEN = The access token to enable writing to the associated PDAP dataset. -To obtain your access token, consult user settings at -and ensure you have write access to . -* LABEL_STUDIO_ACCESS_TOKEN = The access token for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. -* LABEL_STUDIO_PROJECT_ID = The project ID for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL. - -### Instructions - -Run the following script from the root directory -```bash -python common_crawler/main.py CC-MAIN-2023-50 '*.gov' police --config common_crawler/config.ini --pages 2 -``` - -This example will crawl a single page (typically 15000 records) of the Common Crawl dataset with ID `CC-MAIN-2023-50` -and search for the term `police` in all the pages with the `.gov` domain. It will use the default configuration file `config.ini` -to determine the json cache location and the location of the output csv file. - -Note that the cache records the most recent page number that was used for given combination of Common Crawl ID, url search term, and keyword. -If the same command is run again, it will start from the next page. -If you want to reset the cache, you can use the `--reset-cache` flag. - -By default, the output csv file will be named `urls.csv` and will be located in the `data` directory of the module. -This csv file contains both the url and the parameters used to query it. - -### Parameters - -- **common_crawl_id**: Required. Specifies the Common Crawl Index to perform the search on. -- **url**: Required. Specifies the domain URL to query. Wildcard characters such as * can be used to expand the search. Note that the query must be contained within quotes (as in '*.gov') to prevent misinterpretation of wildcards -- **search_term**: Required. Specifies keyword within the url to search for. -- **-c or --config**: Optional. Specifies the configuration file to use. The default value is config.ini. -- **-p or --pages**: Optional. Specifies the number of pages to search. The default value is 1. -- **--reset-cache**: Optional. If set, it resets the cache before starting the crawl. 
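
For reference, the index queries this module issued can be reproduced directly against the public Common Crawl CDX index API. The sketch below mirrors the example invocation above (same index ID, URL pattern, and keyword); as in the crawler, keyword filtering happens client-side, since the CDX API matches only on URL patterns.

```python
import json

import requests

# Same index ID, URL pattern, and keyword as the example invocation above.
INDEX_URL = "https://index.commoncrawl.org/CC-MAIN-2023-50-index"
params = {"url": "*.gov", "output": "json", "page": 0}

response = requests.get(INDEX_URL, params=params, timeout=60)
response.raise_for_status()

# The CDX API returns one JSON object per line; keep urls containing the keyword.
records = (json.loads(line) for line in response.text.splitlines())
urls = sorted({record["url"] for record in records if "police" in record["url"]})
print(f"{len(urls)} matching urls on page 0")
```
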
- -### Configuration - -Several attributes are currently defined in `config.ini`: -- **cache_filename**: This is the name of the cache file. The default value is `cache`. The file will be saved with a `.json` extension. -- **output_filename**: This is the name of the output file. The default value is `urls`. The file will be saved with a `.csv` extension. -- **data_dir**: This is the directory where the cache and output files will be saved. The default value is `data`. -- **huggingface_repo_id**: This is the repository ID for the hugging face dataset which urls will be uploaded to - -## Code Structure - -The code is structured as follows: -- **main.py**: This is the main file that is used to run the module. It contains the logic to parse the command line arguments and call the necessary functions. -- **crawler.py**: This file contains the logic to interface with the Common Crawl dataset and extract urls. -- **cache.py**: This file contains the logic to read and write the cache file. -- **argparser.py**: This file contains the logic to parse the command line and config arguments. -- **csv_manager.py**: This file contains the logic to write the output csv file. -- **utils.py**: This file contains utility functions. -- **config.ini**: This file contains the default configuration values. -- **README.md**: This file contains the documentation for the module. You're reading it right now. Isn't that nifty! - -## Testing - -A suite of unit and integration tests were developed for this module. - -To run the tests, run the following command from this directory: - -```bash -pytest ../tests/test_common_crawler_integration.py -pytest ../tests/test_common_crawler_unit.py -``` \ No newline at end of file diff --git a/source_collectors/common_crawler/argparser.py b/source_collectors/common_crawler/argparser.py deleted file mode 100644 index 67f4a290..00000000 --- a/source_collectors/common_crawler/argparser.py +++ /dev/null @@ -1,95 +0,0 @@ -import argparse -import configparser -import re - -""" -This module contains the argument parser for command line arguments -for the Common Crawler script. -""" - - -def valid_common_crawl_id(common_crawl_id: str) -> bool: - """ - Validate the Common Crawl ID format. - The Common Crawl ID should be in the format CC-MAIN-YYYY-WW. - Args: - common_crawl_id: The Common Crawl ID to validate - Returns: - True if the Common Crawl ID is valid, False otherwise - """ - return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None - - -def parse_args() -> argparse.Namespace: - """ - Parse the command line arguments for the Common Crawler script - as well as the configuration file. - Arguments parsed include: - - The Common Crawl ID - - The URL to query - - The search term - - The number of pages to search - - The configuration file (defaults to config.ini) - - A flag to reset the cache - Returns: The parsed arguments - """ - - parser = argparse.ArgumentParser( - description="Query the Common Crawl dataset and optionally save the results to a file." 
- ) - # Add the required arguments - parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") - parser.add_argument("url", type=str, help="The URL to query") - parser.add_argument("keyword", type=str, help="The keyword to search in the url") - # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument( - "-c", - "--config", - type=str, - default="config.ini", - help="The configuration file to use", - ) - parser.add_argument( - "-p", - "--pages", - type=int, - default=1, - help="The number of pages to search (default: 1)", - ) - parser.add_argument( - "--reset-cache", - action="store_true", - default=False, - help="Reset the cache before starting the crawl", - ) - - args = parser.parse_args() - - # Validate the Common Crawl ID format - if not valid_common_crawl_id(args.common_crawl_id): - parser.error( - "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." - ) - - # Read the configuration file - config = configparser.ConfigParser() - config.read(args.config) - - # Combine parsed arguments with configuration file defaults - app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config["DEFAULT"]) - - app_args = app_parser.parse_args() - - # Print arguments - print(f"--Common Crawl ID: {app_args.common_crawl_id}") - print(f"--URL: {app_args.url}") - print(f"--Keyword: {app_args.keyword}") - print(f"--Number of Pages: {app_args.pages}") - print(f"--Configuration File: {app_args.config}") - print(f"--Reset Cache: {app_args.reset_cache}") - print(f"--Output File: {app_args.output_filename}.csv") - print(f"--Cache File: {app_args.cache_filename}.json") - print(f"--Data Directory: {app_args.data_dir}") - - return app_args diff --git a/source_collectors/common_crawler/cache.py b/source_collectors/common_crawler/cache.py deleted file mode 100644 index 23d58819..00000000 --- a/source_collectors/common_crawler/cache.py +++ /dev/null @@ -1,93 +0,0 @@ -import json - -from util.miscellaneous_functions import get_file_path - -""" -This module contains classes for managing a cache of Common Crawl search results -These classes include: - - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results -""" - - -class CommonCrawlerCacheManager: - """ - A class for managing the cache of Common Crawl search results. - This class is responsible for adding, retrieving, and saving cache data. - """ - - def __init__(self, file_name: str = "cache", directory=None): - """ - Initializes the CacheStorage object with a file name and directory. - Args: - file_name: the name of the cache file - directory: the directory to store the cache file - """ - self.file_path = get_file_path(f"{file_name}.json", directory) - print(f"Cache file path: {self.file_path}") - self.cache = self.load_or_create_cache() - - def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: - """ - Updates the cache with the last page crawled for a given index, url, and keyword. - Or adds a new cache object if it does not exist. - Args: - index: the index of the common crawl - url: the url to search - keyword: the search term to use - last_page: the last page crawled - Returns: None - """ - if index not in self.cache: - self.cache[index] = {} - if url not in self.cache[index]: - self.cache[index][url] = {} - self.cache[index][url][keyword] = last_page - - def get(self, index, url, keyword) -> int: - """ - Retrieves a page number from the cache. 
- Args: - index: the index of the common crawl - url: the url to search - keyword: the search term to use - - Returns: int - the last page crawled - - """ - if ( - index in self.cache - and url in self.cache[index] - and keyword in self.cache[index][url] - ): - return self.cache[index][url][keyword] - # The cache object does not exist. Return 0 as the default value. - return 0 - - def load_or_create_cache(self) -> dict: - """ - Loads the cache from the configured file path. - If the file does not exist, an empty dictionary is returned. - Returns: dict - the cache data - """ - try: - with open(self.file_path, "r") as file: - return json.load(file) - except FileNotFoundError: - return {} - - def save_cache(self) -> None: - """ - Converts the cache object into a JSON-serializable format and saves it to the configured file path. - This method ensures the cache is stored in a readable and easily reloadable format, allowing for - persistence of crawl data across sessions. - """ - # Reformat cache data for JSON serialization - with open(self.file_path, "w") as file: - json.dump(self.cache, file, indent=4) - - def reset_cache(self) -> None: - """ - Resets the cache to an empty state. - """ - self.cache = {} - print("Cache has been reset.") diff --git a/source_collectors/common_crawler/config.ini b/source_collectors/common_crawler/config.ini deleted file mode 100644 index fc558303..00000000 --- a/source_collectors/common_crawler/config.ini +++ /dev/null @@ -1,19 +0,0 @@ -# This configuration file contains default settings for the Common Crawler application. -# Settings can be modified to suit different environments or testing needs. - -[DEFAULT] -# Filename for the cache. Stores which pages have been crawled -# at which combinations of index, url search term, and keyword -# to avoid re-crawling them. -cache_filename = cache - -# Directory where data files (both cache and output) are stored. -# Change as needed for different environments. -# Path is relative from working directory that executes common_crawler/main.py -data_dir = common_crawler/data - -# Filename for the output CSV containing crawled URLs. -output_filename = urls - -# Name of the huggingface repo -huggingface_repo_id = PDAP/unlabeled-urls \ No newline at end of file diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py deleted file mode 100644 index 5a80aeaa..00000000 --- a/source_collectors/common_crawler/csv_manager.py +++ /dev/null @@ -1,79 +0,0 @@ -import csv -import os - -from util.miscellaneous_functions import get_file_path - - -class CSVManager: - """ - Manages a CSV file for storing URLs. - Creates the file if it doesn't exist, and provides a method for adding new rows. - """ - - def __init__(self, file_name: str, headers: list[str], directory=None): - """ - Args: - file_name: the name of the CSV file - headers: the headers for the CSV file - directory: the directory to store the CSV file - """ - self.file_path = get_file_path(f"{file_name}.csv", directory) - self.headers = headers - if not os.path.exists(self.file_path): - self.initialize_file() - - def add_row(self, row_values: list[str] | tuple[str]): - """ - Appends a new row of data to the CSV. 
- Args: - row_values: list of values to add to the csv, in order of their inclusion in the list - """ - if isinstance(row_values, str): - # Single values must be converted to a list format - row_values = [row_values] - try: - with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: - writer = csv.writer(file) - writer.writerow(row_values) - except Exception as e: - print(f"An error occurred while trying to write to {self.file_path}: {e}") - - def add_rows(self, results: list[list[str]]) -> None: - """ - Appends multiple rows of data to the CSV as a list of lists of strings. - Args: - results: list[list[str] - a list of lists of strings, each inner list representing a row - Returns: None - """ - for result in results: - self.add_row(result) - print(f"{len(results)} URLs written to {self.file_path}") - - def initialize_file(self): - """ - Initializes the CSV file. - If the file doesn't exist, it creates it with the header row. - """ - # check if file exists - file_exists = os.path.isfile(self.file_path) - - if not file_exists: - with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: - writer = csv.writer(file) - writer.writerow(self.headers) - else: - # Open and check that headers match - with open(self.file_path, mode="r", encoding="utf-8") as file: - header_row = next(csv.reader(file)) - if header_row != self.headers: - raise ValueError( - f"Header row in {self.file_path} does not match expected headers" - ) - print(f"CSV file initialized at {self.file_path}") - - def delete_file(self): - """ - Deletes the CSV file. - """ - os.remove(self.file_path) - print(f"CSV file deleted at {self.file_path}") diff --git a/source_collectors/common_crawler/data/cache.json b/source_collectors/common_crawler/data/cache.json deleted file mode 100644 index e12687ad..00000000 --- a/source_collectors/common_crawler/data/cache.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "CC-MAIN-2023-50": { - "*.gov": { - "police": 10 - } - } -} \ No newline at end of file diff --git a/source_collectors/common_crawler/data/urls.csv b/source_collectors/common_crawler/data/urls.csv deleted file mode 100644 index 6fc4dc6f..00000000 --- a/source_collectors/common_crawler/data/urls.csv +++ /dev/null @@ -1,207 +0,0 @@ -Index,Search Term,Keyword,Page,URL -CC-MAIN-2023-50,*.gov,police,2,https://acworth-ga.gov/administering-the-oath-of-office-to-a-newly-promoted-member-of-the-police-department/ -CC-MAIN-2023-50,*.gov,police,2,https://www.ada.gov/policevideo/policebroadbandgallery.htm -CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/franklintonpolice.htm -CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/illinois_state_police.htm -CC-MAIN-2023-50,*.gov,police,2,https://www.adamn.gov/p/other/police-department -CC-MAIN-2023-50,*.gov,police,2,https://www.adamscountypa.gov/police/earpd -CC-MAIN-2023-50,*.gov,police,2,https://www.aftonwyoming.gov/government/police_department/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/community_relations.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/community_relations.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/crime_snapshot_statistics.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/crime_snapshot_statistics.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/index.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/index.php 
-CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/investigative_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/investigative_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/procedures.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/procedures.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/recruiting/index.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/recruiting/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/services_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/services_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/transparency_hub.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/transparency_hub.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/uniform_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/uniform_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/zone_command.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/zone_command.php -CC-MAIN-2023-50,*.gov,police,6,https://adeca.alabama.gov/2022/11/14/gov-ivey-announces-grant-to-help-auburn-police-deter-crime/ -CC-MAIN-2023-50,*.gov,police,7,https://governor.alabama.gov/newsroom/2020/02/kimberly-police-officer-nick-orear-flag-memo/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/de/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/police-jurisdictions/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ru/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2015-police-jurisdiction-annexations-deannexations-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2023-police-jurisdiction-deannexations-ordinances-and-maps/ -CC-MAIN-2023-50,*.gov,police,8,https://tourism.alabama.gov/tag/world-police-and-fire-games/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/departments/police-department/community_resources_apd.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/v/237/departments/police-department/community_resources_apd.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/sharedassets/public/alameda/police/policy-manual.pdf -CC-MAIN-2023-50,*.gov,police,8,http://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf 
-CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/departments/police/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/news/stories/peace-officers-memorial-day-and-national-police-week/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/public-safety/police/police-blotter/ -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/index.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/jobs/ -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/faiiap/police-fire/index.shtml -CC-MAIN-2023-50,*.gov,police,10,https://gov.alaska.gov/a-proclamation-on-honoring-united-states-capitol-police-officers/ -CC-MAIN-2023-50,*.gov,police,10,https://geohub.albanyga.gov/datasets/corrected-police-beat -CC-MAIN-2023-50,*.gov,police,10,https://data.albanyny.gov/browse?tags=police+report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/contact-the-albany-police-department -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/programs/medication-and-sharps-disposal -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/hr/salary-schedules/police-table -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/apba/scholarship_packet.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/a18_alarm_user_permit_application.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/secondhand_dealer.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/Solicitor_License.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/neighborhood-watch/2013_nw_brochure-update.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/property/propertyinventoryrecord-fillable.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/child_safety_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/facebook_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/linkedln_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/photosharingservices_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/smartphone_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/twitter_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/ -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/administration -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/history -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/records-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/cold-cases -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/property-inventory-record -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/animal-control -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/apba -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/community-police-academy -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/medication-and-sharps-disposal -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-speed-watch -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/neighborhood-speed-watch -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-watch-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safereturn -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/youthacademy -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/qrcode -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/robots.txt -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/child-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/online-social-media-safety-tips -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/protecting-your-business -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safe-exchange-zones -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safety-on-the-road -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/vehicle -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/cadet-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/career-opportunities -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/lateral-officers -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-02-22/alexandria-police-department-makes-arrest-in-connection-to-shots-fired-incident -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-03-15/alexandria-police-department-apprehends-assault-suspect -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-22/alexandria-police-officer-arrested -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-25/alexandria-police-department-investigates-first-homicide-of-the-year -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-04-18/don-hayes-appointed-alexandria-police-chief -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-06-06/alexandria-police-makes-arrest-in-fatal-shooting -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-08-29/alexandria-police-department-investigates-serious-crash -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-12-21/alexandria-police-department-investigates-shooting-incident -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-09-29/apd-lt-graduates-from-dc-police-leadership-academy -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/ -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/community-police-academy -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/criminal-investigation-division -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/listing-page/apd-news-releases -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/office-of-the-police-chief -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/other-services -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/police-services -CC-MAIN-2023-50,*.gov,police,11,http://www3.alexandriava.gov/police/crime_reports/reporter.php -CC-MAIN-2023-50,*.gov,police,11,https://www3.alexandriava.gov/police/crime_reports/reporter.php -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=112991 
-CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/default.aspx?id=24274 -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=59358 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=27648 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=33624 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=68136 -CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-3030.aspx -CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-4122.aspx -CC-MAIN-2023-50,*.gov,police,11,https://aliquippapa.gov/events/light-up-night-at-the-aliquippa-police-station/ -CC-MAIN-2023-50,*.gov,police,11,https://www.almaarkansas.gov/police/ -CC-MAIN-2023-50,*.gov,police,11,https://www.almontmichigan.gov/departments/police-department/ -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/contact-forms/departments/police/report-an-abandoned-vehicle-on-public-streets -CC-MAIN-2023-50,*.gov,police,11,https://www.altoonapa.gov/contacts/police/commander-of-criminal-investigation/lt-ashley-day -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/animal-control -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/directory -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/services -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-documents/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-staff/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/how-do-i-file-a-police-report-2/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/who-do-i-call-about-police-related-non-emergencies-2/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/topics/police-courts/ -CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt -CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt -CC-MAIN-2023-50,*.gov,police,11,https://share.america.gov/ar/heres-police-held-accountable-shooting-incidents-video/ diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py deleted file mode 100644 index 67bd4c45..00000000 --- a/source_collectors/common_crawler/main.py +++ /dev/null @@ -1,366 +0,0 @@ -import argparse -import collections -import dataclasses -import os -import re -import sys -from datetime import datetime - -from dotenv import load_dotenv - -from source_collectors.common_crawler.argparser import parse_args -from source_collectors.common_crawler.cache import CommonCrawlerCacheManager -from source_collectors.common_crawler.crawler import CommonCrawlResult, CommonCrawlerManager -from source_collectors.common_crawler.csv_manager import CSVManager - -# The below code sets the working directory to be the root of the entire repository -# This is done to solve otherwise quite annoying import issues. -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) - -from util.huggingface_api_manager import HuggingFaceAPIManager -from util.miscellaneous_functions import get_filename_friendly_timestamp -from label_studio_interface.LabelStudioConfig import LabelStudioConfig -from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager - -""" -This module contains the main function for the Common Crawler script. 
-""" - - -@dataclasses.dataclass -class BatchInfo: - """ - Dataclass for batch info - """ - datetime: str - source: str - count: str - keywords: str - notes: str - filename: str - - -class LabelStudioError(Exception): - """Custom exception for Label Studio Errors""" - - pass - - -BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] - - -def get_current_time(): - """ - Returns the current time - """ - return str(datetime.now()) - - -def add_batch_info_to_csv( - common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int -) -> BatchInfo: - """ - Adds batch info to CSV - """ - batch_info = BatchInfo( - datetime=get_current_time(), - source="Common Crawl", - count=str(len(common_crawl_result.url_results)), - keywords=f"{args.url} - {args.keyword}", - notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", - ) - - batch_info_csv_manager = CSVManager( - file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS - ) - batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) - - return batch_info - - -def main(): - """ - Main function - """ - # Parse the arguments - args = parse_args() - - # Initialize the Cache - cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, directory=args.data_dir - ) - - load_dotenv() - - # Initialize the HuggingFace API Manager - hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") - if not hf_access_token: - raise ValueError( - "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " - "Please obtain access token from your personal account at " - "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory." - ) - huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, repo_id=args.huggingface_repo_id - ) - ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") - if not ls_access_token: - raise ValueError( - "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " - "Please obtain access token from your personal account at " - "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory." - ) - ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") - if not ls_project_id: - raise ValueError( - "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " - "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory." 
- ) - - try: - print("Retrieving Label Studio data for deduplication") - label_studio_results = get_ls_data() - if label_studio_results is None: - raise LabelStudioError("Failed to retrieve Label Studio Data") - print("Label Studio data retrieved successfully") - except LabelStudioError as e: - print(e) - raise - - if args.reset_cache: - cache_manager.reset_cache() - - try: - # Retrieve the last page from the cache, or 0 if it does not exist - last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload( - args, last_page, huggingface_api_manager, label_studio_results - ) - except ValueError as e: - print(f"Error during crawling: {e}") - return - - try: - cache_manager.upsert( - index=args.common_crawl_id, - url=args.url, - keyword=args.keyword, - last_page=common_crawl_result.last_page_search, - ) - cache_manager.save_cache() - - except ValueError as e: - print(f"Error while saving cache manager: {e}") - - -def handle_remote_results_error(remote_results): - """ - Handles errors in the remote results - - Args: remote_results (dict): The results from the label studio project - Raises: LabelStudioError: If an error is found in the remote results - """ - - status_code = remote_results.get("status_code") - if status_code == 401: - raise LabelStudioError("Invalid Label Studio token passed! Exiting...") - elif status_code == 404: - raise LabelStudioError("Invalid Label Studio Project ID! Exiting...") - else: - raise LabelStudioError(f"Unexpected error: {remote_results}") - - -def validate_remote_results(remote_results): - """ - Validates the remote results retrieved from the Label Studio project - - Args: remote_results (dict or list): The results from the Label Studio project - - Returns: - list[dict]: If the remote results are valid - None: If the remote results are invalid - """ - if isinstance(remote_results, list): - if not remote_results: - print("No data in Label Studio project.") - return [] - elif "url" not in remote_results[0]["data"]: - raise LabelStudioError( - "Column 'url' not present in Label Studio project. Exiting..." - ) - else: - return remote_results - elif isinstance(remote_results, dict): - handle_remote_results_error(remote_results) - else: - raise LabelStudioError("Unexpected response type.") - - -def get_ls_data() -> list[dict] | None: - """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. - - Returns: - list[dict] | None: Data from the Labels Studio project or None if the result is invalid. - """ - # Retrieve the data from the Labels Studio project - config = LabelStudioConfig() - api_manager = LabelStudioAPIManager(config) - response = api_manager.import_tasks_from_project(all_tasks=True) - remote_results = response.json() - - return validate_remote_results(remote_results) - - -def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. - - Args: - url (str): The URL to strip. - - Returns: - str: The stripped URL. - """ - result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) - return result - - -def remove_local_duplicates(url_results: list[str]) -> list[str]: - """Removes duplicate URLs from a list, ignoring http(s)://www. - - Args: - url_results (list[str]): List of URLs to deduplicate. - - Returns: - list[str]: List of unique URLs. 
- """ - stripped_url_results = [strip_url(url) for url in url_results] - unique_urls = collections.deque() - adjust = 0 - - for index, url in enumerate(stripped_url_results): - if url in unique_urls: - del url_results[index - adjust] - adjust += 1 - else: - unique_urls.appendleft(url) - - return url_results - - -def remove_remote_duplicates( - url_results: list[str], label_studio_data: list[dict] -) -> list[str]: - """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. - - Args: - url_results (list[str]): List of URLs to deduplicate. - label_studio_data (list[dict]): Label Studio project data to check for duplicates. - - Returns: - list[str]: List of remaining URLs not present in the Label Studio project. - """ - try: - remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] - except TypeError: - print( - "Invalid Label Studio credentials. Database could not be checked for duplicates." - ) - return url_results - remote_urls = set(remote_urls) - - stripped_url_results = [strip_url(url) for url in url_results] - adjust = 0 - - for index, url in enumerate(stripped_url_results): - if url in remote_urls: - del url_results[index - adjust] - adjust += 1 - - return url_results - - -def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int, -): - """ - Handles the CSV file and uploads it to Hugging Face repository. - Args: - common_crawl_result: The result from Common Crawl. - huggingface_api_manager: The Hugging Face API manager. - args: The command-line arguments. - last_page: last page crawled - - """ - batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) - - csv_manager = CSVManager( - file_name=batch_info.filename, headers=["url"], directory=args.data_dir - ) - csv_manager.add_rows(common_crawl_result.url_results) - huggingface_api_manager.upload_file( - local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", - ) - print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" - ) - csv_manager.delete_file() - - -def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict], -) -> CommonCrawlResult: - """ - Processes a crawl and uploads the results to Hugging Face. - """ - # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager(args.common_crawl_id) - # Determine the pages to search, based on the last page searched - start_page = last_page + 1 - # Use the parsed arguments - common_crawl_result: CommonCrawlResult = crawler_manager.crawl( - search_term=args.url, - keyword=args.keyword, - num_pages=args.pages, - start_page=start_page, - ) - # Logic should conclude here if no results are found - if not common_crawl_result.url_results: - print("No url results found. Ceasing main execution.") - add_batch_info_to_csv(common_crawl_result, args, last_page) - return common_crawl_result - - print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates( - common_crawl_result.url_results - ) - common_crawl_result.url_results = remove_remote_duplicates( - common_crawl_result.url_results, label_studio_data - ) - if not common_crawl_result.url_results: - print( - "No urls not already present in the database found. 
Ceasing main execution." - ) - add_batch_info_to_csv(common_crawl_result, args, last_page) - return common_crawl_result - - handle_csv_and_upload(common_crawl_result, huggingface_api_manager, args, last_page) - - return common_crawl_result - - -if __name__ == "__main__": - # Example usage: python main.py CC-MAIN-2023-50 *.gov "police" - # Usage with optional arguments: python main.py CC-MAIN-2023-50 *.gov "police" -p 2 -o police_urls.txt - print("Running Common Crawler...") - main() diff --git a/source_collectors/common_crawler/requirements_common_crawler_action.txt b/source_collectors/common_crawler/requirements_common_crawler_action.txt deleted file mode 100644 index 22823fd0..00000000 --- a/source_collectors/common_crawler/requirements_common_crawler_action.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests~=2.31.0 -python-dotenv~=1.0.1 -huggingface-hub~=0.22.2 \ No newline at end of file diff --git a/source_collectors/common_crawler/schemas.py b/source_collectors/common_crawler/schemas.py deleted file mode 100644 index 608f9632..00000000 --- a/source_collectors/common_crawler/schemas.py +++ /dev/null @@ -1,22 +0,0 @@ -from marshmallow import Schema, fields - - -class CommonCrawlerConfigSchema(Schema): - common_crawl_id = fields.String( - required=True, - description="The Common Crawl ID", - example="CC-MAIN-2022-10" - ) - url = fields.String(required=True, description="The URL to query", example="*.gov") - keyword = fields.String(required=True, description="The keyword to search in the url", example="police") - start_page = fields.Integer(required=False, description="The page to start from", example=1) - pages = fields.Integer(required=False, description="The number of pages to search", example=1) - -class CommonCrawlerOutputSchema(Schema): - urls = fields.List( - fields.String( - required=True - ), - required=True, - description="The list of URLs found in the search" - ) \ No newline at end of file diff --git a/agency_identifier/MuckrockAPIInterface.py b/source_collectors/muckrock/MuckrockAPIInterface.py similarity index 100% rename from agency_identifier/MuckrockAPIInterface.py rename to source_collectors/muckrock/MuckrockAPIInterface.py diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md index 43bae80d..a7e75b71 100644 --- a/source_collectors/muckrock/README.md +++ b/source_collectors/muckrock/README.md @@ -4,85 +4,3 @@ This repo provides tools for searching Muckrock FOIA requests, it includes scripts for downloading data from MuckRock, generating CSV files per PDAP database requirements, and automatic labeling -## Installation - -### 1. Clone the `scrapers` repository and navigate to the `muckrock_tools` directory. - -``` -git clone git@github.com:Police-Data-Accessibility-Project/scrapers.git -cd scrapers/scrapers_library/data_portals/muckrock/muckrock_tools -``` - -### 2. Create a virtual environment. - -If you don't already have virtualenv, install the package: - -``` - -pip install virtualenv - -``` - -Then run the following command to create a virtual environment (ensure the python version is as below): - -``` - -virtualenv -p python3.12 venv - -``` - -### 3. Activate the virtual environment. - -``` - -source venv/bin/activate - -``` - -### 4. Install dependencies. - -``` - -pip install -r requirements.txt - -``` - -## Uses - -### 1. Simple Search Term - -- `muck_get.py` -- script to perform searches on MuckRock's database, by matching a search string to title of request. Search is slow due to rate limiting (cannot multi thread around it). 
- -### 2. Clone Muckrock database & search locally - -- scripts to clone the MuckRock foia requests collection for fast local querying (total size <2GB at present) - -- `create_foia_data_db.py` creates and populates a SQLite database (`foia_data.db`) with all MuckRock foia requests. Various errors outside the scope of this script may occur; a counter (`last_page_fetched.txt`) is created to keep track of the most recent page fetched and inserted into the database. If the program exits prematurely, simply run `create_foia_data_db.py` again to continue where you left off. A log file is created to capture errors for later reference. - -- After `foia_data.db` is created, run `search_foia_data_db.py`, which receives a search string as input and outputs a JSON file with all related FOIA requests for later processing by `generate_detailed_muckrock_csv.py`. For example, - -``` -python3 create_foia_data_db.py - -python3 search_foia_data_db.py --search_for "use of force" -``` - -produces 'use_of_force.json'. - -### 3. County Level Search - -- `get_allegheny_foias.py`, `allegheny_county_towns.txt` -- To search for any and all requests in a certain county (e.g. Allegheny in this case) you must provide a list of all municipalities contained within the county. Muckrock stores geographic info in tiers, from Federal, State, and local level. At the local level, e.g. Pittsburgh and Allegheny County are in the same tier, with no way to determine which municipalities reside within a county (without providing it yourself). - -The `get_allegheny_foias.py` script will find the jurisdiction ID for each municipality in `allegheny_county_towns.txt`, then find all completed FOIA requests for those jurisdictions. - -### 4. Generate detailed FOIA data in PDAP database format - -- `generate_detailed_muckrock_csv.py` -- Once you have a json of relevant FOIA's, run it through this script to generate a CSV that fulfills PDAP database requirements. - -### 5. ML Labeling - -- `muckrock_ml_labeler.py` -- A tool for auto labeling MuckRock sources. This script is using [fine-url-classifier](https://huggingface.co/PDAP/fine-url-classifier) to assign 1 of 36 record type labels. At present, script is expecting each source to have associated header tags, provided via `html-tag-collector/collector.py`. 
(TODO: For muckrock sources, `collector.py` insufficient, does not grab main text of the request) diff --git a/source_collectors/muckrock/classes/SQLiteClient.py b/source_collectors/muckrock/classes/SQLiteClient.py deleted file mode 100644 index 96a59d82..00000000 --- a/source_collectors/muckrock/classes/SQLiteClient.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -import sqlite3 - - -class SQLClientError(Exception): - pass - - -class SQLiteClient: - - def __init__(self, db_path: str) -> None: - self.conn = sqlite3.connect(db_path) - - def execute_query(self, query: str, many=None): - - try: - if many is not None: - self.conn.executemany(query, many) - else: - self.conn.execute(query) - self.conn.commit() - except sqlite3.Error as e: - print(f"SQLite error: {e}") - error_msg = f"Failed to execute query due to SQLite error: {e}" - logging.error(error_msg) - self.conn.rollback() - raise SQLClientError(error_msg) - -class SQLiteClientContextManager: - - def __init__(self, db_path: str) -> None: - self.client = SQLiteClient(db_path) - - def __enter__(self): - return self.client - - def __exit__(self, exc_type, exc_value, traceback): - self.client.conn.close() \ No newline at end of file diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py deleted file mode 100644 index b958b61c..00000000 --- a/source_collectors/muckrock/muck_get.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -A straightforward standalone script for downloading data from MuckRock -and searching for it with a specific search string. -""" -from source_collectors.muckrock.classes.FOIASearcher import FOIASearcher -from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher -from source_collectors.muckrock.utils import save_json_file - -if __name__ == "__main__": - search_term = "use of force" - fetcher = FOIAFetcher() - searcher = FOIASearcher(fetcher=fetcher, search_term=search_term) - results = searcher.search_to_count(20) - json_out_file = search_term.replace(" ", "_") + ".json" - save_json_file(file_path=json_out_file, data=results) - print(f"List dumped into {json_out_file}") diff --git a/source_collectors/muckrock/requirements.txt b/source_collectors/muckrock/requirements.txt deleted file mode 100644 index babb4f3e..00000000 --- a/source_collectors/muckrock/requirements.txt +++ /dev/null @@ -1,30 +0,0 @@ -certifi==2024.8.30 -charset-normalizer==3.4.0 -filelock==3.16.1 -fsspec==2024.10.0 -huggingface-hub==0.26.1 -idna==3.10 -Jinja2==3.1.4 -logging==0.4.9.6 -MarkupSafe==3.0.2 -mpmath==1.3.0 -networkx==3.4.2 -numpy==2.1.2 -packaging==24.1 -pandas==2.2.3 -python-dateutil==2.9.0.post0 -pytz==2024.2 -PyYAML==6.0.2 -regex==2024.9.11 -requests==2.32.3 -safetensors==0.4.5 -setuptools==75.2.0 -six==1.16.0 -sympy==1.13.1 -tokenizers==0.20.1 -torch==2.5.0 -tqdm==4.66.5 -transformers==4.46.0 -typing_extensions==4.12.2 -tzdata==2024.2 -urllib3==2.2.3 diff --git a/tests/manual/agency_identifier/test_muckrock_api_interface.py b/tests/manual/agency_identifier/test_muckrock_api_interface.py index 2dac6bd4..e3a86ed9 100644 --- a/tests/manual/agency_identifier/test_muckrock_api_interface.py +++ b/tests/manual/agency_identifier/test_muckrock_api_interface.py @@ -1,7 +1,7 @@ import pytest from aiohttp import ClientSession -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface @pytest.mark.asyncio diff --git a/tests/manual/unsorted/test_identifier_unit.py 
b/tests/manual/unsorted/test_identifier_unit.py deleted file mode 100644 index a6dcc1fb..00000000 --- a/tests/manual/unsorted/test_identifier_unit.py +++ /dev/null @@ -1,275 +0,0 @@ -import tempfile -from unittest.mock import patch - -import pytest -import requests_mock - -from agency_identifier.identifier import * - - -@pytest.fixture -def mock_env(monkeypatch): - monkeypatch.setenv("VUE_APP_PDAP_API_KEY", "test_api_key") - - -def test_get_page_data_success(mock_env): - with requests_mock.Mocker() as m: - m.get("https://data-sources.pdap.io/api/agencies/1", json={"data": "test_data"}, status_code=200) - data = get_page_data(1) - assert data == "test_data" - - -def test_get_page_data_failure(mock_env): - with requests_mock.Mocker() as m: - m.get("https://data-sources.pdap.io/api/agencies/1", status_code=404) - with pytest.raises(Exception): - get_page_data(1) - - -@pytest.mark.parametrize("url,expected", [ - ("http://www.example.com", "example.com"), - ("https://example.com", "example.com"), - ("example.com", "example.com"), - ("www.example.com", "example.com"), -]) -def test_parse_hostname(url, expected): - assert parse_hostname(url) == expected - - -@pytest.mark.parametrize("url", [ - "http:///www.example.com", # Invalid URL - "://example.com", # Missing scheme -]) -def test_parse_hostname_failure(url): - with pytest.raises(Exception): - parse_hostname(url) - - -@pytest.mark.parametrize("url,expected", [ - ("http://www.example.com", "example.com/"), - ("https://example.com", "example.com/"), - ("http://example.com/path/to/page", "example.com/path/to/page/"), - ("www.example.com", "example.com/"), - ("example.com/", "example.com/"), -]) -def test_remove_http(url, expected): - assert remove_http(url) == expected - - -@pytest.fixture -def agencies_and_hostnames(): - return ( - [{"name": "Agency 1", "homepage_url": "https://agency1.com"}], - ["agency1.com"] - ) - - -def test_match_agencies_found(agencies_and_hostnames): - agencies, agency_hostnames = agencies_and_hostnames - match = match_agencies(agencies, agency_hostnames, "http://www.agency1.com/page") - assert match["status"] == "Match found" - assert match["agency"]["name"] == "Agency 1" - - -def test_match_agencies_no_match(agencies_and_hostnames): - agencies, agency_hostnames = agencies_and_hostnames - match = match_agencies(agencies, agency_hostnames, "http://www.nonexistentagency.com") - assert match["status"] == "No match found" - assert match["agency"] == [] - -@pytest.fixture -def agencies_with_same_hostname(): - return ( - [ - {"name": "Agency 1", "homepage_url": "http://agency.com/path1"}, - {"name": "Agency 2", "homepage_url": "http://agency.com/path2"} - ], - ["agency.com", "agency.com"] - ) - -def test_match_agencies_multiple_found(agencies_with_same_hostname): - agencies, agency_hostnames = agencies_with_same_hostname - # A URL that matches the first agency more closely - match = match_agencies(agencies, agency_hostnames, "http://agency.com/path1/page") - assert match["status"] == "Match found" - assert match["agency"]["name"] == "Agency 1" - - # A URL that doesn't closely match either agency's homepage URL path - contested_match = match_agencies(agencies, agency_hostnames, "http://agency.com/otherpath/page") - assert contested_match["status"] == "Contested match" - assert contested_match["agency"] == [] - - # A URL that matches the second agency more closely - match_second = match_agencies(agencies, agency_hostnames, "http://agency.com/path2/anotherpage") - assert match_second["status"] == "Match found" - assert 
match_second["agency"]["name"] == "Agency 2" - -@patch('agency_identifier.identifier.get_page_data') -def test_get_agencies_data(mock_get_page_data, mock_env): - # Mock get_page_data to return a dictionary on the first call and an empty dictionary on the second call - mock_get_page_data.side_effect = [ - [{"name": "Agency 1", "homepage_url": "https://agency1.com", "id": "1"}], # First page data - [] # Indicates no more pages - ] - - df = get_agencies_data() - assert not df.is_empty() - assert len(df) == 1 - assert df["name"][0] == "Agency 1" - assert df["homepage_url"][0] == "https://agency1.com" - - -# Sample data to simulate what `match_urls_to_agencies_and_clean_data` might return -sample_agencies_data = polars.DataFrame({ - "url": ["http://agency1.com", "http://agency2.com", "http://nonexistentagency.com"], - "homepage_url": ["http://agency1.com", "http://agency2.com", None], - "hostname": ["agency1.com", "agency2.com", None], -}) - -# Sample input URLs DataFrame -sample_urls_df = polars.DataFrame({ - "url": ["http://agency1.com/page1", "http://agency2.com/page2", "http://nonexistentagency.com/page"] -}) - - -@pytest.fixture -def mock_match_urls_to_agencies_and_clean_data(): - with patch('agency_identifier.identifier.match_urls_to_agencies_and_clean_data') as mock: - mock.return_value = sample_agencies_data - yield mock - - -def test_process_data(mock_match_urls_to_agencies_and_clean_data): - processed_df = process_data(sample_urls_df) - - # Verify that the mock was called once with the sample_urls_df - mock_match_urls_to_agencies_and_clean_data.assert_called_once_with(sample_urls_df) - - # Check that the processed DataFrame has filtered out the unmatched URLs - assert len(processed_df) == 2 # Expecting only matched URLs to be present - - # Check if the 'hostname' column exists and has no null values in the result - assert "hostname" in processed_df.columns - assert processed_df.filter(polars.col("hostname").is_null()).height == 0 - - # You might also want to check specific values if necessary - assert processed_df["url"].to_list() == ["http://agency1.com", "http://agency2.com"] - - -# Sample data to simulate what `get_agencies_data` might return -sample_get_agencies_data = polars.DataFrame({ - "homepage_url": ["http://agency1.com", "http://agency2.com"], - "name": ["Agency 1", "Agency 2"], - "count_data_sources": [10, 15], - "hostname": ["agency1.com", "agency2.com"], # Assume this is added by the function -}) - - -@pytest.fixture -def mock_get_agencies_data(): - with patch('agency_identifier.identifier.get_agencies_data') as mock: - mock.return_value = sample_get_agencies_data - yield mock - - -def test_match_urls_to_agencies_and_clean_data(mock_get_agencies_data): - matched_df = match_urls_to_agencies_and_clean_data(sample_urls_df) - - # Verify that `get_agencies_data` was called - mock_get_agencies_data.assert_called_once() - - # Verify the structure and content of the matched DataFrame - # Expect that each URL is matched with the correct agency based on the hostname - # Additionally, check for the addition of any new columns or transformations you apply - assert "homepage_url" in matched_df.columns - assert len(matched_df) == len(sample_urls_df) # Ensure all URLs are processed - - # Verify that URLs are correctly matched or not matched to agencies - # This assumes that the function annotates the DataFrame with match results - assert matched_df.filter(polars.col("url") == "http://agency1.com/page1").select("name")["name"][0] == "Agency 1" - assert 
matched_df.filter(polars.col("url") == "http://nonexistentagency.com/page").select("name")["name"][0] == "" - - -def test_read_data_success(): - # Create a temporary file with some CSV content - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp: - tmp.write("column1,column2\nvalue1,value2") - tmp_path = tmp.name - - # Attempt to read the file with read_data - try: - df = read_data(tmp_path) - assert not df.is_empty() - assert "column1" in df.columns - assert df.shape == (1, 2) - finally: - # Clean up the temporary file - os.remove(tmp_path) - -def test_read_data_failure(): - # Test reading a non-existent file should raise an exception - with pytest.raises(Exception): - read_data("non_existent_file.csv") - - -def test_write_data_success(): - # Create a DataFrame to write - df = polars.DataFrame({"column1": ["value1"], "column2": ["value2"]}) - - # Use a temporary file to write the DataFrame - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp: - tmp_path = tmp.name - - # Write the DataFrame and verify the file contents - try: - write_data(df, tmp_path) - - # Read back the file to verify contents - with open(tmp_path, 'r') as f: - content = f.read() - assert "column1,column2" in content - assert "value1,value2" in content - finally: - # Clean up the temporary file - os.remove(tmp_path) - - -def test_write_data_failure(monkeypatch): - # Simulate an error by patching the `write_csv` method to raise an exception - with monkeypatch.context() as m: - m.setattr(polars.DataFrame, "write_csv", - lambda self, file_path: (_ for _ in ()).throw(Exception("Mock write failure"))) - with pytest.raises(Exception) as exc_info: - df = polars.DataFrame({"column1": ["value1"], "column2": ["value2"]}) - write_data(df, "path/to/non_writable_directory/file.csv") - assert "Mock write failure" in str(exc_info.value) - -@patch('agency_identifier.identifier.write_data') -@patch('agency_identifier.identifier.process_data') -@patch('agency_identifier.identifier.read_data') -def test_process_and_write_data_success(mock_read_data, mock_process_data, mock_write_data): - # Setup mock return values - mock_read_data.return_value = polars.DataFrame({"url": ["http://example.com"]}) - processed_df = polars.DataFrame({"url": ["http://example.com"], "processed": [True]}) - mock_process_data.return_value = processed_df - - # Call the function with mocked input and output file paths - process_and_write_data("input_file.csv", "output_file.csv") - - # Verify that read_data and write_data were called correctly - mock_read_data.assert_called_once_with("input_file.csv") - mock_process_data.assert_called_once_with(mock_read_data.return_value) - mock_write_data.assert_called_once_with(processed_df, "output_file.csv") - -@pytest.mark.parametrize("side_effect,expected_exception", [ - (FileNotFoundError, FileNotFoundError), - (PermissionError, PermissionError), -]) -@patch('agency_identifier.identifier.write_data') -@patch('agency_identifier.identifier.process_data') -@patch('agency_identifier.identifier.read_data') -def test_process_and_write_data_failure(mock_read_data, mock_process_data, mock_write_data, side_effect, expected_exception): - mock_read_data.side_effect = side_effect - - with pytest.raises(expected_exception): - process_and_write_data("input_file.csv", "output_file.csv") \ No newline at end of file diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index ae34b28e..1dc05b44 100644 --- a/tests/test_automated/integration/api/conftest.py +++ 
b/tests/test_automated/integration/api/conftest.py
@@ -21,7 +21,6 @@ class APITestHelper:
     async_core: AsyncCore
     db_data_creator: DBDataCreator
     mock_huggingface_interface: MagicMock
-    mock_label_studio_interface: MagicMock
 
     def adb_client(self):
         return self.db_data_creator.adb_client
@@ -71,6 +70,5 @@ async def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> A
         async_core=client.app.state.async_core,
         db_data_creator=db_data_creator,
         mock_huggingface_interface=MagicMock(),
-        mock_label_studio_interface=MagicMock()
     )
     await client.app.state.async_core.collector_manager.logger.clear_log_queue()
diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
index 8fb9f4a5..cd9556cb 100644
--- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
+++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py
@@ -5,7 +5,7 @@ import pytest
 from aiohttp import ClientSession
 
-from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse
+from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse
 from collector_db.models import Agency, AutomatedUrlAgencySuggestion
 from collector_manager.enums import CollectorType
 from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome

From c239567f3905b270dcb7d771d42acc9284653348 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Thu, 17 Apr 2025 16:44:02 -0400
Subject: [PATCH 118/182] fix(build): remove nonexistent directory from
 dockerfile

---
 Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index dfcb1392..6718a121 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,6 @@ RUN playwright install chromium
 RUN playwright install-deps chromium
 
 # Copy project files
-COPY agency_identifier ./agency_identifier
 COPY api ./api
 COPY collector_db ./collector_db
 COPY collector_manager ./collector_manager

From 333baf58e42da3861b719b63d668450f366ca970 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Thu, 17 Apr 2025 16:57:37 -0400
Subject: [PATCH 119/182] docs(api): Change `/docs` to `/api` for API display

Additionally, add a redirect at the `/docs` endpoint that sends users to
the `/api` route.
---
 api/main.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/api/main.py b/api/main.py
index 40970e4f..c993b941 100644
--- a/api/main.py
+++ b/api/main.py
@@ -3,6 +3,7 @@ import aiohttp
 import uvicorn
 from fastapi import FastAPI
+from starlette.responses import RedirectResponse
 
 from api.routes.annotate import annotate_router
 from api.routes.batch import batch_router
@@ -110,10 +111,15 @@ async def setup_database(db_client):
 app = FastAPI(
     title="Source Collector API",
     description="API for collecting data sources",
+    docs_url='/api',
     version="0.1.0",
     lifespan=lifespan
 )
 
+@app.get("/docs", include_in_schema=False)
+async def redirect_docs():
+    return RedirectResponse(url="/api")
+
 
 routers = [
     root_router,

From 1830b16a05d0da8d99ee7c91d8c8a27f3ba9e435 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Fri, 18 Apr 2025 07:55:52 -0400
Subject: [PATCH 120/182] refactor(app): consolidate environment variable
 usage

Consolidate environment variable usage through a centralized class to
enable easy mocking and tracking of environment variables.
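For reference, a minimal sketch of what such a centralized accessor could
look like. The real `core/EnvVarManager.py` is not included in this patch;
only the `EnvVarManager.get()` entry point and the
`google_api_key`/`google_cse_id` attributes are visible in the diff below,
so everything else in this sketch is an assumption:

```
# Hypothetical sketch only; not the actual core/EnvVarManager.py.
import os
from typing import ClassVar, Optional


class EnvVarManager:
    _instance: ClassVar[Optional["EnvVarManager"]] = None

    def __init__(self) -> None:
        # Read every required variable once, in a single place.
        # os.environ[...] fails fast with a KeyError if a variable is unset.
        self.google_api_key = os.environ["GOOGLE_API_KEY"]
        self.google_cse_id = os.environ["GOOGLE_CSE_ID"]

    @classmethod
    def get(cls) -> "EnvVarManager":
        # Lazily build one shared instance; a test can overwrite
        # cls._instance with a stub to control what callers see.
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance
```

Centralizing the reads this way lets a test patch a single object instead
of monkeypatching `os.environ` across many modules.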
---
 source_collectors/auto_googler/AutoGooglerCollector.py | 9 +++++----
 .../integration/source_collectors/__init__.py          | 0
 .../source_collectors/test_example_collector.py        | 0
 3 files changed, 5 insertions(+), 4 deletions(-)
 delete mode 100644 tests/test_automated/integration/source_collectors/__init__.py
 delete mode 100644 tests/test_automated/integration/source_collectors/test_example_collector.py

diff --git a/source_collectors/auto_googler/AutoGooglerCollector.py b/source_collectors/auto_googler/AutoGooglerCollector.py
index 1748d911..01387d0b 100644
--- a/source_collectors/auto_googler/AutoGooglerCollector.py
+++ b/source_collectors/auto_googler/AutoGooglerCollector.py
@@ -1,13 +1,13 @@
-import asyncio
 
 from collector_manager.AsyncCollectorBase import AsyncCollectorBase
 from collector_manager.enums import CollectorType
+from core.EnvVarManager import EnvVarManager
 from core.preprocessors.AutoGooglerPreprocessor import AutoGooglerPreprocessor
 from source_collectors.auto_googler.AutoGoogler import AutoGoogler
 from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO, AutoGooglerInnerOutputDTO
 from source_collectors.auto_googler.GoogleSearcher import GoogleSearcher
 from source_collectors.auto_googler.SearchConfig import SearchConfig
-from util.helper_functions import get_from_env, base_model_list_dump
+from util.helper_functions import base_model_list_dump
 
 
 class AutoGooglerCollector(AsyncCollectorBase):
@@ -16,14 +16,15 @@ class AutoGooglerCollector(AsyncCollectorBase):
 
     async def run_to_completion(self) -> AutoGoogler:
         dto: AutoGooglerInputDTO = self.dto
+        env_var_manager = EnvVarManager.get()
         auto_googler = AutoGoogler(
             search_config=SearchConfig(
                 urls_per_result=dto.urls_per_result,
                 queries=dto.queries,
             ),
             google_searcher=GoogleSearcher(
-                api_key=get_from_env("GOOGLE_API_KEY"),
-                cse_id=get_from_env("GOOGLE_CSE_ID"),
+                api_key=env_var_manager.google_api_key,
+                cse_id=env_var_manager.google_cse_id,
             )
         )
         async for log in auto_googler.run():
diff --git a/tests/test_automated/integration/source_collectors/__init__.py b/tests/test_automated/integration/source_collectors/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/test_automated/integration/source_collectors/test_example_collector.py b/tests/test_automated/integration/source_collectors/test_example_collector.py
deleted file mode 100644
index e69de29b..00000000

From 06406bdaded60121a9b0df113f515f0a6fae2c96 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 18 Apr 2025 10:34:53 -0400
Subject: [PATCH 121/182] refactor(app): reduce memory strain from huggingface
 task

Run the HuggingFace relevancy model in a short-lived worker subprocess so
its memory is released back to the OS once classification completes.
---
 .../URLRelevanceHuggingfaceTaskOperator.py |  2 +-
 hugging_face/HuggingFaceInterface.py       | 48 +++++++++----------
 hugging_face/relevancy_worker.py           | 15 ++++++
 .../test_hugging_face_interface.py         |  8 ++--
 4 files changed, 45 insertions(+), 28 deletions(-)
 create mode 100644 hugging_face/relevancy_worker.py

diff --git a/core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py b/core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py
index 4871a9f0..49aa7aa0 100644
--- a/core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py
+++ b/core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py
@@ -46,7 +46,7 @@ async def put_results_into_database(self, tdos):
 
     async def add_huggingface_relevancy(self, tdos: list[URLRelevanceHuggingfaceTDO]):
         urls_with_html = [tdo.url_with_html for tdo in tdos]
-        results = self.huggingface_interface.get_url_relevancy(urls_with_html)
+        results = await
self.huggingface_interface.get_url_relevancy_async(urls_with_html) for tdo, result in zip(tdos, results): tdo.relevant = result diff --git a/hugging_face/HuggingFaceInterface.py b/hugging_face/HuggingFaceInterface.py index 87d88caf..9ad11d0b 100644 --- a/hugging_face/HuggingFaceInterface.py +++ b/hugging_face/HuggingFaceInterface.py @@ -1,34 +1,34 @@ -from transformers import pipeline +import asyncio +import json +import sys +from typing import List from collector_db.DTOs.URLWithHTML import URLWithHTML -import gc class HuggingFaceInterface: @staticmethod - def load_relevancy_model() -> pipeline: - return pipeline("text-classification", model="PDAP/url-relevance") - - def get_url_relevancy( - self, - urls_with_html: list[URLWithHTML], - threshold: float = 0.5 - ) -> list[bool]: - urls = [url_with_html.url for url_with_html in urls_with_html] - relevance_pipe = self.load_relevancy_model() - results: list[dict] = relevance_pipe(urls) - - bool_results = [] - for result in results: - score = result["score"] - if score >= threshold: - bool_results.append(True) - else: - bool_results.append(False) - del relevance_pipe - gc.collect() - return bool_results + async def get_url_relevancy_async(urls_with_html: List[URLWithHTML]) -> List[bool]: + urls = [u.url for u in urls_with_html] + input_data = json.dumps(urls) + proc = await asyncio.create_subprocess_exec( + sys.executable, "hugging_face/relevancy_worker.py", + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate(input=input_data.encode("utf-8")) + raw_output = stdout.decode("utf-8").strip() + + # Try to extract the actual JSON line + for line in raw_output.splitlines(): + try: + return json.loads(line) + except json.JSONDecodeError: + continue + + raise RuntimeError(f"Could not parse JSON from subprocess: {raw_output}") diff --git a/hugging_face/relevancy_worker.py b/hugging_face/relevancy_worker.py new file mode 100644 index 00000000..5d07d10f --- /dev/null +++ b/hugging_face/relevancy_worker.py @@ -0,0 +1,15 @@ +import sys +import json +from transformers import pipeline + +def main(): + urls = json.loads(sys.stdin.read()) + + pipe = pipeline("text-classification", model="PDAP/url-relevance") + results = pipe(urls) + bools = [r["score"] >= 0.5 for r in results] + + print(json.dumps(bools)) + +if __name__ == "__main__": + main() diff --git a/tests/manual/huggingface/test_hugging_face_interface.py b/tests/manual/huggingface/test_hugging_face_interface.py index b1b86350..08ce8ccd 100644 --- a/tests/manual/huggingface/test_hugging_face_interface.py +++ b/tests/manual/huggingface/test_hugging_face_interface.py @@ -1,13 +1,15 @@ +import pytest + from collector_db.DTOs.URLWithHTML import URLWithHTML from hugging_face.HuggingFaceInterface import HuggingFaceInterface - -def test_get_url_relevancy(): +@pytest.mark.asyncio +async def test_get_url_relevancy(): hfi = HuggingFaceInterface() def package_url(url: str) -> URLWithHTML: return URLWithHTML(url=url, url_id=1, html_infos=[]) - result = hfi.get_url_relevancy([ + result = await hfi.get_url_relevancy_async([ package_url("https://coloradosprings.gov/police-department/article/news/i-25-traffic-safety-deployment-after-stop"), package_url("https://example.com"), package_url("https://police.com") From 0c902809910408c7835e4d602668bbade7331b41 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 10:41:38 -0400 Subject: [PATCH 122/182] fix(tests): fix broken test --- 
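A note on the subprocess pattern introduced in the preceding patch: deleting a transformers pipeline and calling `gc.collect()` does not reliably return the model's memory to the operating system, because CPython and the underlying allocators hold on to freed blocks. A short-lived worker process does; everything is reclaimed when it exits. Below is a self-contained sketch of the same JSON-over-stdio round-trip, with an inline `-c` script standing in for `relevancy_worker.py` and its model:

    import asyncio
    import json
    import sys

    # Stand-in worker: reads a JSON list of URLs on stdin,
    # writes a JSON list of booleans on stdout.
    WORKER = (
        "import sys, json; "
        "urls = json.load(sys.stdin); "
        "print(json.dumps(['police' in u for u in urls]))"
    )


    async def classify(urls: list[str]) -> list[bool]:
        # All memory the worker allocates is returned to the OS on exit.
        proc = await asyncio.create_subprocess_exec(
            sys.executable, "-c", WORKER,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate(input=json.dumps(urls).encode("utf-8"))
        return json.loads(stdout)


    print(asyncio.run(classify(["https://example.com", "https://police.com"])))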
.../tasks/test_url_relevancy_huggingface_task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py index abe15965..95fb5fc7 100644 --- a/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py +++ b/tests/test_automated/integration/tasks/test_url_relevancy_huggingface_task.py @@ -21,7 +21,7 @@ def num_to_bool(num: int) -> bool: else: return False - def mock_get_url_relevancy( + async def mock_get_url_relevancy( urls_with_html: list[URLWithHTML], threshold: float = 0.8 ) -> list[bool]: @@ -33,7 +33,7 @@ def mock_get_url_relevancy( return results mock_hf_interface = MagicMock(spec=HuggingFaceInterface) - mock_hf_interface.get_url_relevancy = mock_get_url_relevancy + mock_hf_interface.get_url_relevancy_async = mock_get_url_relevancy task_operator = URLRelevanceHuggingfaceTaskOperator( adb_client=AsyncDatabaseClient(), @@ -50,7 +50,7 @@ def mock_get_url_relevancy( await db_data_creator.html_data(url_ids) run_info: TaskOperatorRunInfo = await task_operator.run_task(1) - assert run_info.outcome == TaskOperatorOutcome.SUCCESS + assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message results = await db_data_creator.adb_client.get_all(AutoRelevantSuggestion) From 36d8b5d815645e2e9b667ca504e50f0af670afa4 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 13:57:55 -0400 Subject: [PATCH 123/182] refactor(tests): reduce time required to run time-bound tests implement techniques to simulate extended periods of time, instead of simply running things for those periods of time. --- collector_manager/ExampleCollector.py | 9 ++++-- tests/helpers/AwaitableBarrier.py | 13 ++++++++ tests/helpers/patch_functions.py | 10 +++++++ .../integration/api/test_duplicates.py | 5 +--- .../integration/api/test_example_collector.py | 25 +++++++++++----- .../core/test_example_collector_lifecycle.py | 30 ++++++++++++++----- .../unit/core/test_core_logger.py | 4 +-- 7 files changed, 73 insertions(+), 23 deletions(-) create mode 100644 tests/helpers/AwaitableBarrier.py create mode 100644 tests/helpers/patch_functions.py diff --git a/collector_manager/ExampleCollector.py b/collector_manager/ExampleCollector.py index 9f451732..7bc8a583 100644 --- a/collector_manager/ExampleCollector.py +++ b/collector_manager/ExampleCollector.py @@ -21,9 +21,14 @@ async def run_implementation(self) -> None: sleep_time = dto.sleep_time for i in range(sleep_time): # Simulate a task await self.log(f"Step {i + 1}/{sleep_time}") - await asyncio.sleep(1) # Simulate work + await self.sleep() self.data = ExampleOutputDTO( message=f"Data collected by {self.batch_id}", urls=["https://example.com", "https://example.com/2"], parameters=self.dto.model_dump(), - ) \ No newline at end of file + ) + + @staticmethod + async def sleep(): + # Simulate work + await asyncio.sleep(1) \ No newline at end of file diff --git a/tests/helpers/AwaitableBarrier.py b/tests/helpers/AwaitableBarrier.py new file mode 100644 index 00000000..8bf65a11 --- /dev/null +++ b/tests/helpers/AwaitableBarrier.py @@ -0,0 +1,13 @@ +import asyncio + + +class AwaitableBarrier: + def __init__(self): + self._event = asyncio.Event() + + async def __call__(self, *args, **kwargs): + await self._event.wait() + + def release(self): + self._event.set() + diff --git a/tests/helpers/patch_functions.py b/tests/helpers/patch_functions.py new file mode 100644 index 
00000000..bb805d29 --- /dev/null +++ b/tests/helpers/patch_functions.py @@ -0,0 +1,10 @@ +from tests.helpers.AwaitableBarrier import AwaitableBarrier + + +async def block_sleep(monkeypatch) -> AwaitableBarrier: + barrier = AwaitableBarrier() + monkeypatch.setattr( + "collector_manager.ExampleCollector.ExampleCollector.sleep", + barrier + ) + return barrier diff --git a/tests/test_automated/integration/api/test_duplicates.py b/tests/test_automated/integration/api/test_duplicates.py index a5c77b29..babffeba 100644 --- a/tests/test_automated/integration/api/test_duplicates.py +++ b/tests/test_automated/integration/api/test_duplicates.py @@ -12,7 +12,7 @@ def test_duplicates(api_test_helper): disable_task_trigger(ath) dto = ExampleInputDTO( - sleep_time=1 + sleep_time=0 ) batch_id_1 = ath.request_validator.example_collector( @@ -21,15 +21,12 @@ def test_duplicates(api_test_helper): assert batch_id_1 is not None - time.sleep(1) - batch_id_2 = ath.request_validator.example_collector( dto=dto )["batch_id"] assert batch_id_2 is not None - time.sleep(1.5) bi_1: BatchInfo = ath.request_validator.get_batch_info(batch_id_1) bi_2: BatchInfo = ath.request_validator.get_batch_info(batch_id_2) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index b13f7e31..d7ec88fd 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -1,6 +1,5 @@ import asyncio -import time -from unittest.mock import MagicMock, AsyncMock +from unittest.mock import AsyncMock import pytest @@ -14,24 +13,29 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.enums import BatchStatus +from tests.helpers.patch_functions import block_sleep from tests.test_automated.integration.api.conftest import disable_task_trigger @pytest.mark.asyncio -async def test_example_collector(api_test_helper): +async def test_example_collector(api_test_helper, monkeypatch): ath = api_test_helper + barrier = await block_sleep(monkeypatch) + # Temporarily disable task trigger disable_task_trigger(ath) + logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient(), flush_interval=1) await logger.__aenter__() ath.async_core.collector_manager.logger = logger dto = ExampleInputDTO( - sleep_time=1 - ) + sleep_time=1 + ) + # Request Example Collector data = ath.request_validator.example_collector( dto=dto ) @@ -39,10 +43,14 @@ async def test_example_collector(api_test_helper): assert batch_id is not None assert data["message"] == "Started example collector." + # Yield control so coroutine runs up to the barrier + await asyncio.sleep(0) + + + # Check that batch currently shows as In Process bsr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( status=BatchStatus.IN_PROCESS ) - assert len(bsr.results) == 1 bsi: BatchStatusInfo = bsr.results[0] @@ -50,7 +58,8 @@ async def test_example_collector(api_test_helper): assert bsi.strategy == CollectorType.EXAMPLE.value assert bsi.status == BatchStatus.IN_PROCESS - await asyncio.sleep(2) + # Release the barrier to resume execution + barrier.release() csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, @@ -113,7 +122,7 @@ async def test_example_collector_error(api_test_helper, monkeypatch): assert batch_id is not None assert data["message"] == "Started example collector." 
- await asyncio.sleep(1) + await asyncio.sleep(0) bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) diff --git a/tests/test_automated/integration/core/test_example_collector_lifecycle.py b/tests/test_automated/integration/core/test_example_collector_lifecycle.py index a9c4900f..65ffc001 100644 --- a/tests/test_automated/integration/core/test_example_collector_lifecycle.py +++ b/tests/test_automated/integration/core/test_example_collector_lifecycle.py @@ -9,11 +9,14 @@ from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.SourceCollectorCore import SourceCollectorCore from core.enums import BatchStatus +from tests.helpers.patch_functions import block_sleep + @pytest.mark.asyncio async def test_example_collector_lifecycle( test_core: SourceCollectorCore, - test_async_core: AsyncCore + test_async_core: AsyncCore, + monkeypatch ): """ Test the flow of an example collector, which generates fake urls @@ -22,6 +25,9 @@ async def test_example_collector_lifecycle( acore = test_async_core core = test_core db_client = core.db_client + + barrier = await block_sleep(monkeypatch) + dto = ExampleInputDTO( example_field="example_value", sleep_time=1 @@ -36,11 +42,13 @@ async def test_example_collector_lifecycle( batch_id = csi.batch_id + # Yield control so coroutine runs up to the barrier + await asyncio.sleep(0) + assert core.get_status(batch_id) == BatchStatus.IN_PROCESS - print("Sleeping for 1.5 seconds...") - await asyncio.sleep(1.5) + # Release the barrier to resume execution + barrier.release() await acore.collector_manager.logger.flush_all() - print("Done sleeping...") assert core.get_status(batch_id) == BatchStatus.READY_TO_LABEL batch_info: BatchInfo = db_client.get_batch_by_id(batch_id) @@ -48,7 +56,7 @@ async def test_example_collector_lifecycle( assert batch_info.status == BatchStatus.READY_TO_LABEL assert batch_info.total_url_count == 2 assert batch_info.parameters == dto.model_dump() - assert batch_info.compute_time > 1 + assert batch_info.compute_time > 0 url_infos = db_client.get_urls_by_batch(batch_id) assert len(url_infos) == 2 @@ -61,15 +69,19 @@ async def test_example_collector_lifecycle( @pytest.mark.asyncio async def test_example_collector_lifecycle_multiple_batches( test_core: SourceCollectorCore, - test_async_core: AsyncCore + test_async_core: AsyncCore, + monkeypatch ): """ Test the flow of an example collector, which generates fake urls and saves them to the database """ + barrier = await block_sleep(monkeypatch) acore = test_async_core core = test_core csis: list[CollectorStartInfo] = [] + + for i in range(3): dto = ExampleInputDTO( example_field="example_value", @@ -82,12 +94,16 @@ async def test_example_collector_lifecycle_multiple_batches( ) csis.append(csi) + await asyncio.sleep(0) for csi in csis: print("Batch ID:", csi.batch_id) assert core.get_status(csi.batch_id) == BatchStatus.IN_PROCESS - await asyncio.sleep(3) + barrier.release() + + await asyncio.sleep(0.15) for csi in csis: assert core.get_status(csi.batch_id) == BatchStatus.READY_TO_LABEL + diff --git a/tests/test_automated/unit/core/test_core_logger.py b/tests/test_automated/unit/core/test_core_logger.py index d91ce6cd..b0d52055 100644 --- a/tests/test_automated/unit/core/test_core_logger.py +++ b/tests/test_automated/unit/core/test_core_logger.py @@ -10,14 +10,14 @@ @pytest.mark.asyncio async def test_logger_flush(): mock_adb_client = AsyncMock() - async with AsyncCoreLogger(flush_interval=1, adb_client=mock_adb_client) as logger: + async with 
AsyncCoreLogger(flush_interval=0.01, adb_client=mock_adb_client) as logger: # Add logs await logger.log(LogInfo(log="Log 1", batch_id=1)) await logger.log(LogInfo(log="Log 2", batch_id=1)) # Wait for the flush interval - await asyncio.sleep(1.5) + await asyncio.sleep(0.02) # Verify logs were flushed mock_adb_client.insert_logs.assert_called_once() From f7a96064c0bc4c73ef2cbe9eee341cf28993cbf2 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 14:03:23 -0400 Subject: [PATCH 124/182] refactor(tests): reduce time required to run time-bound tests implement techniques to simulate extended periods of time, instead of simply running things for those periods of time. --- tests/test_automated/integration/api/test_example_collector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index d7ec88fd..92a42317 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -60,6 +60,7 @@ async def test_example_collector(api_test_helper, monkeypatch): # Release the barrier to resume execution barrier.release() + await asyncio.sleep(0) csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, From d2cfc83b4a76cd68531bb81629c2b1fa8a3e3a04 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 14:04:46 -0400 Subject: [PATCH 125/182] refactor(tests): reduce time required to run time-bound tests implement techniques to simulate extended periods of time, instead of simply running things for those periods of time. --- tests/test_automated/integration/api/test_duplicates.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_automated/integration/api/test_duplicates.py b/tests/test_automated/integration/api/test_duplicates.py index babffeba..e4c8af24 100644 --- a/tests/test_automated/integration/api/test_duplicates.py +++ b/tests/test_automated/integration/api/test_duplicates.py @@ -1,11 +1,15 @@ +import asyncio import time +import pytest + from collector_db.DTOs.BatchInfo import BatchInfo from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from tests.test_automated.integration.api.conftest import disable_task_trigger -def test_duplicates(api_test_helper): +@pytest.mark.asyncio +async def test_duplicates(api_test_helper): ath = api_test_helper # Temporarily disable task trigger @@ -27,6 +31,8 @@ def test_duplicates(api_test_helper): assert batch_id_2 is not None + await asyncio.sleep(0.1) + bi_1: BatchInfo = ath.request_validator.get_batch_info(batch_id_1) bi_2: BatchInfo = ath.request_validator.get_batch_info(batch_id_2) From 569b46c3cb749e64438c4992a574fdea746a85a2 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 14:07:42 -0400 Subject: [PATCH 126/182] refactor(tests): reduce time required to run time-bound tests implement techniques to simulate extended periods of time, instead of simply running things for those periods of time. 
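The mechanism these test patches converge on deserves one worked example. The patched-in `AwaitableBarrier` blocks the collector where it used to sleep; the test yields control with `await asyncio.sleep(0)` so the coroutine runs up to the barrier, asserts on the intermediate state, then releases the barrier so the remaining work finishes instantly. A self-contained illustration of that control flow, with a `worker` coroutine standing in for the collector:

    import asyncio


    class AwaitableBarrier:
        # Same shape as the helper in tests/helpers/AwaitableBarrier.py
        def __init__(self):
            self._event = asyncio.Event()

        async def __call__(self, *args, **kwargs):
            await self._event.wait()

        def release(self):
            self._event.set()


    async def main():
        barrier = AwaitableBarrier()

        async def worker():
            await barrier()  # stands in for the patched ExampleCollector.sleep
            return "done"

        task = asyncio.create_task(worker())
        await asyncio.sleep(0)  # yield once: worker runs up to the barrier
        assert not task.done()  # the in-process state is observable without waiting
        barrier.release()       # the simulated time passes instantly
        assert await task == "done"


    asyncio.run(main())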
--- tests/test_automated/integration/api/test_example_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index 92a42317..00361cd0 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -60,7 +60,7 @@ async def test_example_collector(api_test_helper, monkeypatch): # Release the barrier to resume execution barrier.release() - await asyncio.sleep(0) + await asyncio.sleep(0.1) csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, From 55fe581dce901058c2cc68cb1a1fb83b1c5ae3f7 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 14:55:54 -0400 Subject: [PATCH 127/182] refactor(tests): reduce time required to run time-bound tests implement techniques to simulate extended periods of time, instead of simply running things for those periods of time. --- tests/test_automated/integration/api/test_example_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index 00361cd0..c001963a 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -60,7 +60,7 @@ async def test_example_collector(api_test_helper, monkeypatch): # Release the barrier to resume execution barrier.release() - await asyncio.sleep(0.1) + await asyncio.sleep(0.3) csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, From 9b58948ac71c94ea63066a0a3701fc377d22b8a5 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 16:32:16 -0400 Subject: [PATCH 128/182] refactor(tests): reduce time required to run time-bound tests implement techniques to simulate extended periods of time, instead of simply running things for those periods of time. 
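Where releasing a barrier still leaves a small scheduling race, this patch switches to a bounded poll (the `wait_for_all_batches_to_complete` helper in the diff below). The general shape, extracted for clarity: the 20-attempt, 0.1-second cadence matches the helper, while the names and the `TimeoutError` are illustrative (the concrete helper raises `ValueError`).

    import asyncio
    from typing import Awaitable, Callable


    async def wait_until(
        condition: Callable[[], Awaitable[bool]],
        attempts: int = 20,
        interval: float = 0.1,
    ) -> None:
        # Poll an async condition with a bounded number of retries,
        # rather than sleeping for a fixed, hopeful amount of time.
        for _ in range(attempts):
            if await condition():
                return
            await asyncio.sleep(interval)
        raise TimeoutError("condition did not hold within the allotted time")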
--- api/main.py | 1 + tests/test_automated/integration/api/conftest.py | 16 +++++++++++++++- .../integration/api/test_duplicates.py | 2 +- .../integration/api/test_example_collector.py | 6 +++--- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/api/main.py b/api/main.py index c993b941..6c5e2018 100644 --- a/api/main.py +++ b/api/main.py @@ -1,3 +1,4 @@ +import asyncio from contextlib import asynccontextmanager import aiohttp diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index 00ee7473..73f0c8ab 100644 --- a/tests/test_automated/integration/api/conftest.py +++ b/tests/test_automated/integration/api/conftest.py @@ -1,6 +1,7 @@ +import asyncio from dataclasses import dataclass from typing import Generator -from unittest.mock import MagicMock, AsyncMock, patch +from unittest.mock import MagicMock, AsyncMock import pytest import pytest_asyncio @@ -9,7 +10,9 @@ from api.main import app from core.AsyncCore import AsyncCore from api.routes.review import requires_final_review_permission +from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.SourceCollectorCore import SourceCollectorCore +from core.enums import BatchStatus from security_manager.SecurityManager import get_access_info, AccessInfo, Permissions, require_permission from tests.helpers.DBDataCreator import DBDataCreator from tests.test_automated.integration.api.helpers.RequestValidator import RequestValidator @@ -26,6 +29,17 @@ class APITestHelper: def adb_client(self): return self.db_data_creator.adb_client + async def wait_for_all_batches_to_complete(self): + for i in range(20): + data: GetBatchStatusResponse = self.request_validator.get_batch_statuses( + status=BatchStatus.IN_PROCESS + ) + if len(data.results) == 0: + return + print("Waiting...") + await asyncio.sleep(0.1) + raise ValueError("Batches did not complete in expected time") + MOCK_USER_ID = 1 def disable_task_trigger(ath: APITestHelper) -> None: diff --git a/tests/test_automated/integration/api/test_duplicates.py b/tests/test_automated/integration/api/test_duplicates.py index e4c8af24..6c6c42ce 100644 --- a/tests/test_automated/integration/api/test_duplicates.py +++ b/tests/test_automated/integration/api/test_duplicates.py @@ -31,7 +31,7 @@ async def test_duplicates(api_test_helper): assert batch_id_2 is not None - await asyncio.sleep(0.1) + await ath.wait_for_all_batches_to_complete() bi_1: BatchInfo = ath.request_validator.get_batch_info(batch_id_1) diff --git a/tests/test_automated/integration/api/test_example_collector.py b/tests/test_automated/integration/api/test_example_collector.py index c001963a..0b3cf30f 100644 --- a/tests/test_automated/integration/api/test_example_collector.py +++ b/tests/test_automated/integration/api/test_example_collector.py @@ -60,7 +60,8 @@ async def test_example_collector(api_test_helper, monkeypatch): # Release the barrier to resume execution barrier.release() - await asyncio.sleep(0.3) + + await ath.wait_for_all_batches_to_complete() csr: GetBatchStatusResponse = ath.request_validator.get_batch_statuses( collector_type=CollectorType.EXAMPLE, @@ -84,7 +85,6 @@ async def test_example_collector(api_test_helper, monkeypatch): # Flush early to ensure logs are written await logger.flush_all() - lr: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) assert len(lr.logs) > 0 @@ -123,7 +123,7 @@ async def test_example_collector_error(api_test_helper, monkeypatch): assert batch_id is not None assert data["message"] == 
"Started example collector." - await asyncio.sleep(0) + await ath.wait_for_all_batches_to_complete() bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) From 48f33e62a68daa3d66c44c5bf58848b7e0d66854 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 18 Apr 2025 18:15:41 -0400 Subject: [PATCH 129/182] feat(app): make delete logs job an asynchronous scheduled task Additionally, because now the Scheduled Task Manager is no longer in use, it has been removed and references to it removed. --- api/main.py | 1 - collector_db/AsyncDatabaseClient.py | 15 ++++++- collector_db/DatabaseClient.py | 9 ----- core/ScheduledTaskManager.py | 40 ++++--------------- core/SourceCollectorCore.py | 21 ---------- .../collector_db/test_db_client.py | 2 +- tests/test_automated/integration/conftest.py | 2 - 7 files changed, 23 insertions(+), 67 deletions(-) diff --git a/api/main.py b/api/main.py index 6c5e2018..ae74c914 100644 --- a/api/main.py +++ b/api/main.py @@ -96,7 +96,6 @@ async def lifespan(app: FastAPI): # Clean up resources, close connections, etc. await core_logger.shutdown() await async_core.shutdown() - source_collector_core.shutdown() await session.close() pass diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 9e1ab473..eb68735c 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1,8 +1,9 @@ +from datetime import datetime, timedelta from functools import wraps from typing import Optional, Type, Any, List from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert, asc +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert, asc, delete from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute, aliased @@ -1615,3 +1616,15 @@ async def get_logs_by_batch_id(self, session, batch_id: int) -> List[LogOutputIn logs = raw_results.scalars().all() return ([LogOutputInfo(**log.__dict__) for log in logs]) + @session_manager + async def delete_old_logs(self, session): + """ + Delete logs older than a day + """ + statement = delete(Log).where( + Log.created_at < datetime.now() - timedelta(days=1) + ) + await session.execute(statement) + + + diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index b8547f1d..3999dbc9 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -158,15 +158,6 @@ def get_batch_status(self, session, batch_id: int) -> BatchStatus: batch = session.query(Batch).filter_by(id=batch_id).first() return BatchStatus(batch.status) - @session_manager - def delete_old_logs(self, session): - """ - Delete logs older than a day - """ - session.query(Log).filter( - Log.created_at < datetime.now() - timedelta(days=1) - ).delete() - @session_manager def update_url(self, session, url_info: URLInfo): url = session.query(URL).filter_by(id=url_info.id).first() diff --git a/core/ScheduledTaskManager.py b/core/ScheduledTaskManager.py index 5b2ff0a7..0a407d9e 100644 --- a/core/ScheduledTaskManager.py +++ b/core/ScheduledTaskManager.py @@ -1,41 +1,9 @@ from datetime import datetime, timedelta from apscheduler.schedulers.asyncio import AsyncIOScheduler -from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.triggers.interval import IntervalTrigger - -from 
collector_db.DatabaseClient import DatabaseClient from core.AsyncCore import AsyncCore - -class ScheduledTaskManager: - - def __init__(self, db_client: DatabaseClient): - # Dependencies - self.db_client = db_client - - # Main objects - self.scheduler = BackgroundScheduler() - self.scheduler.start() - self.add_scheduled_tasks() - - # Jobs - self.delete_old_logs_job = None - - - def add_scheduled_tasks(self): - self.delete_old_logs_job = self.scheduler.add_job( - self.db_client.delete_old_logs, - trigger=IntervalTrigger( - days=1, - start_date=datetime.now() + timedelta(minutes=10) - ) - ) - - def shutdown(self): - if self.scheduler.running: - self.scheduler.shutdown() - class AsyncScheduledTaskManager: def __init__(self, async_core: AsyncCore): @@ -49,6 +17,7 @@ def __init__(self, async_core: AsyncCore): # Jobs self.run_cycles_job = None + self.delete_logs_job = None def add_scheduled_tasks(self): self.run_cycles_job = self.scheduler.add_job( @@ -59,6 +28,13 @@ def add_scheduled_tasks(self): ), misfire_grace_time=60 ) + self.delete_logs_job = self.scheduler.add_job( + self.async_core.adb_client.delete_old_logs, + trigger=IntervalTrigger( + days=1, + start_date=datetime.now() + timedelta(minutes=10) + ) + ) def shutdown(self): if self.scheduler.running: diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index 6f05a3c4..a4699bf6 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -2,7 +2,6 @@ from collector_db.DatabaseClient import DatabaseClient -from core.ScheduledTaskManager import ScheduledTaskManager from core.enums import BatchStatus @@ -10,30 +9,10 @@ class SourceCollectorCore: def __init__( self, db_client: Optional[DatabaseClient] = None, - dev_mode: bool = False ): if db_client is None: db_client = DatabaseClient() self.db_client = db_client - if not dev_mode: - self.scheduled_task_manager = ScheduledTaskManager(db_client=db_client) - else: - self.scheduled_task_manager = None - def get_status(self, batch_id: int) -> BatchStatus: return self.db_client.get_batch_status(batch_id) - - - def shutdown(self): - if self.scheduled_task_manager is not None: - self.scheduled_task_manager.shutdown() - - - - - -""" -TODO: Add logic for batch processing - -""" \ No newline at end of file diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index 5560577e..93edb3ed 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -94,7 +94,7 @@ async def test_delete_old_logs(db_data_creator: DBDataCreator): db_client.insert_logs(log_infos=log_infos) logs = await adb_client.get_logs_by_batch_id(batch_id=batch_id) assert len(logs) == 3 - db_client.delete_old_logs() + await adb_client.delete_old_logs() logs = await adb_client.get_logs_by_batch_id(batch_id=batch_id) assert len(logs) == 0 diff --git a/tests/test_automated/integration/conftest.py b/tests/test_automated/integration/conftest.py index a0180800..70c79c22 100644 --- a/tests/test_automated/integration/conftest.py +++ b/tests/test_automated/integration/conftest.py @@ -13,10 +13,8 @@ def test_core(db_client_test): core = SourceCollectorCore( db_client=db_client_test, - dev_mode=True ) yield core - core.shutdown() @pytest.fixture From 22aa07e0b7c74bd95fd4945b1a57e8f2dc6e59cf Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 21 Apr 2025 13:15:36 -0400 Subject: [PATCH 130/182] feat(app): Create `/annotation/all` endpoints Create 
endpoints for receiving and submitting all annotations for a URL at once. --- api/routes/annotate.py | 62 ++++-- collector_db/AsyncDatabaseClient.py | 198 +++++++++++++----- collector_db/StatementComposer.py | 26 ++- core/AsyncCore.py | 22 ++ core/DTOs/AllAnnotationPostInfo.py | 35 ++++ ...GetNextRecordTypeAnnotationResponseInfo.py | 2 +- .../GetNextURLForAllAnnotationResponse.py | 24 +++ core/DTOs/GetNextURLForAnnotationResponse.py | 9 - core/exceptions.py | 10 + .../api/helpers/RequestValidator.py | 37 +++- .../integration/api/test_annotate.py | 138 +++++++++++- tests/test_automated/unit/dto/__init__.py | 0 .../unit/dto/test_all_annotation_post_info.py | 37 ++++ 13 files changed, 513 insertions(+), 87 deletions(-) create mode 100644 core/DTOs/AllAnnotationPostInfo.py create mode 100644 core/DTOs/GetNextURLForAllAnnotationResponse.py delete mode 100644 core/DTOs/GetNextURLForAnnotationResponse.py create mode 100644 tests/test_automated/unit/dto/__init__.py create mode 100644 tests/test_automated/unit/dto/test_all_annotation_post_info.py diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 84ba00e4..95512a0b 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -4,10 +4,12 @@ from api.dependencies import get_async_core from core.AsyncCore import AsyncCore +from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo +from core.DTOs.GetNextURLForAllAnnotationResponse import GetNextURLForAllAnnotationResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from security_manager.SecurityManager import get_access_info, AccessInfo @@ -18,6 +20,11 @@ responses={404: {"description": "Not found"}}, ) +batch_query = Query( + description="The batch id of the next URL to get. " + "If not specified, defaults to first qualifying URL", + default=None +) @annotate_router.get("/relevance") async def get_next_url_for_relevance_annotation( @@ -40,10 +47,7 @@ async def annotate_url_for_relevance_and_get_next_url( url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - batch_id: Optional[int] = Query( - description="The batch id of the next URL to get. " - "If not specified, defaults to first qualifying URL", - default=None), + batch_id: Optional[int] = batch_query ) -> GetNextRelevanceAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate @@ -62,10 +66,7 @@ async def annotate_url_for_relevance_and_get_next_url( async def get_next_url_for_record_type_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), - batch_id: Optional[int] = Query( - description="The batch id of the next URL to get. 
" - "If not specified, defaults to first qualifying URL", - default=None), + batch_id: Optional[int] = batch_query ) -> GetNextRecordTypeAnnotationResponseOuterInfo: return await async_core.get_next_url_for_record_type_annotation( user_id=access_info.user_id, @@ -78,10 +79,7 @@ async def annotate_url_for_record_type_and_get_next_url( url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - batch_id: Optional[int] = Query( - description="The batch id of the next URL to get. " - "If not specified, defaults to first qualifying URL", - default=None), + batch_id: Optional[int] = batch_query ) -> GetNextRecordTypeAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate @@ -100,10 +98,7 @@ async def annotate_url_for_record_type_and_get_next_url( async def get_next_url_for_agency_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), - batch_id: Optional[int] = Query( - description="The batch id of the next URL to get. " - "If not specified, defaults to first qualifying URL", - default=None), + batch_id: Optional[int] = batch_query ) -> GetNextURLForAgencyAnnotationResponse: return await async_core.get_next_url_agency_for_annotation( user_id=access_info.user_id, @@ -116,10 +111,7 @@ async def annotate_url_for_agency_and_get_next_url( agency_annotation_post_info: URLAgencyAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - batch_id: Optional[int] = Query( - description="The batch id of the next URL to get. " - "If not specified, defaults to first qualifying URL", - default=None), + batch_id: Optional[int] = batch_query ) -> GetNextURLForAgencyAnnotationResponse: """ Post URL annotation and get next URL to annotate @@ -133,3 +125,33 @@ async def annotate_url_for_agency_and_get_next_url( user_id=access_info.user_id, batch_id=batch_id ) + +@annotate_router.get("/all") +async def get_next_url_for_all_annotations( + access_info: AccessInfo = Depends(get_access_info), + async_core: AsyncCore = Depends(get_async_core), + batch_id: Optional[int] = batch_query +) -> GetNextURLForAllAnnotationResponse: + return await async_core.get_next_url_for_all_annotations( + batch_id=batch_id + ) + +@annotate_router.post("/all/{url_id}") +async def annotate_url_for_all_annotations_and_get_next_url( + url_id: int, + all_annotation_post_info: AllAnnotationPostInfo, + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), + batch_id: Optional[int] = batch_query +) -> GetNextURLForAllAnnotationResponse: + """ + Post URL annotation and get next URL to annotate + """ + await async_core.submit_url_for_all_annotations( + user_id=access_info.user_id, + url_id=url_id, + post_info=all_annotation_post_info + ) + return await async_core.get_next_url_for_all_annotations( + batch_id=batch_id + ) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index eb68735c..46cd89db 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -3,11 +3,10 @@ from typing import Optional, Type, Any, List from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update, Delete, Insert, asc, delete +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, update, asc, delete from 
sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute, aliased -from sqlalchemy.sql.functions import coalesce from starlette import status from collector_db.ConfigManager import ConfigManager @@ -23,18 +22,20 @@ from collector_db.DTOs.URLMapping import URLMapping from collector_db.StatementComposer import StatementComposer from collector_db.constants import PLACEHOLDER_AGENCY_NAME -from collector_db.enums import URLMetadataAttributeType, TaskType -from collector_db.helper_functions import get_postgres_connection_string +from collector_db.enums import TaskType from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log from collector_manager.enums import URLStatus, CollectorType +from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAgencyInfo, GetNextURLForAgencyAnnotationInnerResponse +from core.DTOs.GetNextURLForAllAnnotationResponse import GetNextURLForAllAnnotationResponse, \ + GetNextURLForAllAnnotationInnerResponse from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo, \ FinalReviewOptionalMetadata from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo @@ -129,7 +130,6 @@ async def get_next_url_for_user_annotation( session: AsyncSession, user_suggestion_model_to_exclude: UserSuggestionModel, auto_suggestion_relationship: QueryableAttribute, - user_id: int, batch_id: Optional[int], check_if_annotated_not_relevant: bool = False ) -> URL: @@ -140,14 +140,7 @@ async def get_next_url_for_user_annotation( .where(URL.outcome == URLStatus.PENDING.value) # URL must not have user suggestion .where( - not_( - exists( - select(user_suggestion_model_to_exclude) - .where( - user_suggestion_model_to_exclude.url_id == URL.id, - ) - ) - ) + StatementComposer.user_suggestion_not_exists(user_suggestion_model_to_exclude) ) ) @@ -213,7 +206,6 @@ async def get_next_url_for_relevance_annotation( session, user_suggestion_model_to_exclude=UserRelevantSuggestion, auto_suggestion_relationship=URL.auto_relevant_suggestion, - user_id=user_id, batch_id=batch_id ) if url is None: @@ -254,7 +246,6 @@ async def get_next_url_for_record_type_annotation( session, user_suggestion_model_to_exclude=UserRecordTypeSuggestion, auto_suggestion_relationship=URL.auto_record_type_suggestion, - user_id=user_id, batch_id=batch_id, check_if_annotated_not_relevant=True ) @@ -823,9 +814,7 @@ async def get_next_url_agency_for_annotation( select(URL.id, URL.url) # Must not have confirmed agencies .where( - and_( - URL.outcome == URLStatus.PENDING.value - ) + URL.outcome == URLStatus.PENDING.value ) ) @@ -838,9 +827,7 @@ async def get_next_url_agency_for_annotation( .where( ~exists( 
select(UserUrlAgencySuggestion). - where( - UserUrlAgencySuggestion.url_id == URL.id - ). + where(UserUrlAgencySuggestion.url_id == URL.id). correlate(URL) ) ) @@ -885,37 +872,8 @@ async def get_next_url_agency_for_annotation( result = results[0] url_id = result[0] url = result[1] - # Get relevant autosuggestions and agency info, if an associated agency exists - statement = ( - select( - AutomatedUrlAgencySuggestion.agency_id, - AutomatedUrlAgencySuggestion.is_unknown, - Agency.name, - Agency.state, - Agency.county, - Agency.locality - ) - .join(Agency, isouter=True) - .where(AutomatedUrlAgencySuggestion.url_id == url_id) - ) - raw_autosuggestions = await session.execute(statement) - autosuggestions = raw_autosuggestions.all() - agency_suggestions = [] - for autosuggestion in autosuggestions: - agency_id = autosuggestion[0] - is_unknown = autosuggestion[1] - name = autosuggestion[2] - state = autosuggestion[3] - county = autosuggestion[4] - locality = autosuggestion[5] - agency_suggestions.append(GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, - pdap_agency_id=agency_id, - agency_name=name, - state=state, - county=county, - locality=locality - )) + + agency_suggestions = await self.get_agency_suggestions(session, url_id=url_id) # Get HTML content info html_content_infos = await self.get_html_content_info(url_id) @@ -1626,5 +1584,141 @@ async def delete_old_logs(self, session): ) await session.execute(statement) + async def get_agency_suggestions(self, session, url_id: int) -> List[GetNextURLForAgencyAgencyInfo]: + # Get relevant autosuggestions and agency info, if an associated agency exists + + statement = ( + select( + AutomatedUrlAgencySuggestion.agency_id, + AutomatedUrlAgencySuggestion.is_unknown, + Agency.name, + Agency.state, + Agency.county, + Agency.locality + ) + .join(Agency, isouter=True) + .where(AutomatedUrlAgencySuggestion.url_id == url_id) + ) + raw_autosuggestions = await session.execute(statement) + autosuggestions = raw_autosuggestions.all() + agency_suggestions = [] + for autosuggestion in autosuggestions: + agency_id = autosuggestion[0] + is_unknown = autosuggestion[1] + name = autosuggestion[2] + state = autosuggestion[3] + county = autosuggestion[4] + locality = autosuggestion[5] + agency_suggestions.append(GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, + pdap_agency_id=agency_id, + agency_name=name, + state=state, + county=county, + locality=locality + )) + return agency_suggestions + + @session_manager + async def get_next_url_for_all_annotations(self, session, batch_id: Optional[int] = None) -> GetNextURLForAllAnnotationResponse: + query = ( + Select(URL) + .where( + and_( + URL.outcome == URLStatus.PENDING.value, + StatementComposer.user_suggestion_not_exists(UserUrlAgencySuggestion), + StatementComposer.user_suggestion_not_exists(UserRecordTypeSuggestion), + StatementComposer.user_suggestion_not_exists(UserRelevantSuggestion), + ) + ) + ) + if batch_id is not None: + query = query.where(URL.batch_id == batch_id) + + load_options = [ + URL.html_content, + URL.automated_agency_suggestions, + URL.auto_relevant_suggestion, + URL.auto_record_type_suggestion + ] + select_in_loads = [selectinload(load_option) for load_option in load_options] + + # Add load options + query = query.options( + *select_in_loads + ) + + query = query.order_by(URL.id.asc()).limit(1) + raw_results = await session.execute(query) + url = 
raw_results.scalars().one_or_none()
+        if url is None:
+            return GetNextURLForAllAnnotationResponse(
+                next_annotation=None
+            )
+
+        html_response_info = DTOConverter.html_content_list_to_html_response_info(
+            url.html_content
+        )
+
+        if url.auto_relevant_suggestion is not None:
+            auto_relevant = url.auto_relevant_suggestion.relevant
+        else:
+            auto_relevant = None
+
+        if url.auto_record_type_suggestion is not None:
+            auto_record_type = url.auto_record_type_suggestion.record_type
+        else:
+            auto_record_type = None
+
+        agency_suggestions = await self.get_agency_suggestions(session, url_id=url.id)
+
+        return GetNextURLForAllAnnotationResponse(
+            next_annotation=GetNextURLForAllAnnotationInnerResponse(
+                url_id=url.id,
+                url=url.url,
+                html_info=html_response_info,
+                suggested_relevant=auto_relevant,
+                suggested_record_type=auto_record_type,
+                agency_suggestions=agency_suggestions
+            )
+        )
+
+    @session_manager
+    async def add_all_annotations_to_url(
+        self,
+        session,
+        user_id: int,
+        url_id: int,
+        post_info: AllAnnotationPostInfo
+    ):
+
+        # Add relevant annotation
+        relevant_suggestion = UserRelevantSuggestion(
+            url_id=url_id,
+            user_id=user_id,
+            relevant=post_info.is_relevant
+        )
+        session.add(relevant_suggestion)
+
+        # If not relevant, do nothing else
+        if not post_info.is_relevant:
+            return
+
+        record_type_suggestion = UserRecordTypeSuggestion(
+            url_id=url_id,
+            user_id=user_id,
+            record_type=post_info.record_type.value
+        )
+        session.add(record_type_suggestion)
+
+        agency_suggestion = UserUrlAgencySuggestion(
+            url_id=url_id,
+            user_id=user_id,
+            agency_id=post_info.agency.suggested_agency,
+            is_new=post_info.agency.is_new
+        )
+        session.add(agency_suggestion)
+
+
diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py
index e25ba5d4..ca66f6ba 100644
--- a/collector_db/StatementComposer.py
+++ b/collector_db/StatementComposer.py
@@ -1,11 +1,11 @@
 from typing import Any

-from sqlalchemy import Select, select, exists, Table, func, Subquery, and_
+from sqlalchemy import Select, select, exists, Table, func, Subquery, and_, not_, ColumnElement
 from sqlalchemy.orm import aliased

 from collector_db.enums import URLMetadataAttributeType, ValidationStatus, TaskType
 from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch, \
-    ConfirmedURLAgency, LinkTaskURL, Task
+    ConfirmedURLAgency, LinkTaskURL, Task, UserUrlAgencySuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion
 from collector_manager.enums import URLStatus, CollectorType
 from core.enums import BatchStatus
@@ -94,4 +94,24 @@ def pending_urls_missing_miscellaneous_metadata_query() -> Select:
             Batch
         )

-        return query
\ No newline at end of file
+        return query
+
+
+    @staticmethod
+    def user_suggestion_not_exists(
+            model_to_exclude: type[UserUrlAgencySuggestion] |
+                type[UserRecordTypeSuggestion] |
+                type[UserRelevantSuggestion]
+    ) -> ColumnElement[bool]:
+        # True when the given URL has no user suggestion of this type
+
+        not_exists_clause = not_(
+            exists(
+                select(model_to_exclude)
+                .where(
+                    model_to_exclude.url_id == URL.id,
+                )
+            )
+        )
+
+        return not_exists_clause
\ No newline at end of file
diff --git a/core/AsyncCore.py b/core/AsyncCore.py
index d436d3c9..92f097db 100644
--- a/core/AsyncCore.py
+++ b/core/AsyncCore.py
@@ -8,6 +8,7 @@
 from collector_db.enums import TaskType
 from collector_manager.AsyncCollectorManager import AsyncCollectorManager
 from collector_manager.enums import CollectorType
+from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
 from core.DTOs.CollectorStartInfo import CollectorStartInfo
 from 
core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse @@ -17,6 +18,7 @@ from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo +from core.DTOs.GetNextURLForAllAnnotationResponse import GetNextURLForAllAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo @@ -227,6 +229,26 @@ async def get_next_source_for_review( batch_id=batch_id ) + async def get_next_url_for_all_annotations( + self, + batch_id: Optional[int] + ) -> GetNextURLForAllAnnotationResponse: + return await self.adb_client.get_next_url_for_all_annotations( + batch_id=batch_id + ) + + async def submit_url_for_all_annotations( + self, + user_id: int, + url_id: int, + post_info: AllAnnotationPostInfo + ): + await self.adb_client.add_all_annotations_to_url( + user_id=user_id, + url_id=url_id, + post_info=post_info + ) + async def approve_url( self, approval_info: FinalReviewApprovalInfo, diff --git a/core/DTOs/AllAnnotationPostInfo.py b/core/DTOs/AllAnnotationPostInfo.py new file mode 100644 index 00000000..a462b40b --- /dev/null +++ b/core/DTOs/AllAnnotationPostInfo.py @@ -0,0 +1,35 @@ +from http import HTTPStatus +from typing import Optional + +from fastapi import HTTPException +from pydantic import BaseModel, model_validator + +from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo +from core.enums import RecordType +from core.exceptions import FailedValidationException + + +class AllAnnotationPostInfo(BaseModel): + is_relevant: bool + record_type: Optional[RecordType] = None + agency: Optional[URLAgencyAnnotationPostInfo] = None + + @model_validator(mode="before") + def allow_record_type_and_agency_only_if_relevant(cls, values): + is_relevant = values.get("is_relevant") + record_type = values.get("record_type") + agency = values.get("agency") + + if not is_relevant: + if record_type is not None: + raise FailedValidationException("record_type must be None if is_relevant is False") + + if agency is not None: + raise FailedValidationException("agency must be None if is_relevant is False") + return values + # Similarly, if relevant, record_type and agency must be provided + if record_type is None: + raise FailedValidationException("record_type must be provided if is_relevant is True") + if agency is None: + raise FailedValidationException("agency must be provided if is_relevant is True") + return values \ No newline at end of file diff --git a/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py b/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py index 783b5516..4280e00d 100644 --- a/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py +++ b/core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py @@ -12,7 +12,7 @@ class GetNextRecordTypeAnnotationResponseInfo(BaseModel): title="Information about the URL" ) suggested_record_type: Optional[RecordType] = Field( - title="Whether the auto-labeler identified the URL as relevant or not" + title="What record type, if any, the auto-labeler identified the URL as" ) html_info: ResponseHTMLInfo = Field( title="HTML information about the URL" diff --git a/core/DTOs/GetNextURLForAllAnnotationResponse.py b/core/DTOs/GetNextURLForAllAnnotationResponse.py new file mode 
100644 index 00000000..f4fa4bb8 --- /dev/null +++ b/core/DTOs/GetNextURLForAllAnnotationResponse.py @@ -0,0 +1,24 @@ +from typing import Optional + +from pydantic import Field, BaseModel + +from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAgencyInfo +from core.enums import RecordType +from html_tag_collector.DataClassTags import ResponseHTMLInfo + + +class GetNextURLForAllAnnotationInnerResponse(BaseModel): + url_id: int + url: str + html_info: ResponseHTMLInfo + agency_suggestions: Optional[list[GetNextURLForAgencyAgencyInfo]] + suggested_relevant: Optional[bool] = Field( + title="Whether the auto-labeler identified the URL as relevant or not" + ) + suggested_record_type: Optional[RecordType] = Field( + title="What record type, if any, the auto-labeler identified the URL as" + ) + + +class GetNextURLForAllAnnotationResponse(BaseModel): + next_annotation: Optional[GetNextURLForAllAnnotationInnerResponse] \ No newline at end of file diff --git a/core/DTOs/GetNextURLForAnnotationResponse.py b/core/DTOs/GetNextURLForAnnotationResponse.py deleted file mode 100644 index b4bc1087..00000000 --- a/core/DTOs/GetNextURLForAnnotationResponse.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo - - -class GetNextURLForAnnotationResponse(BaseModel): - next_annotation: Optional[AnnotationRequestInfo] = None diff --git a/core/exceptions.py b/core/exceptions.py index d9685245..e3e93e55 100644 --- a/core/exceptions.py +++ b/core/exceptions.py @@ -1,3 +1,8 @@ +from http import HTTPStatus + +from fastapi import HTTPException + + class InvalidPreprocessorError(Exception): pass @@ -8,3 +13,8 @@ class MuckrockAPIError(Exception): class MatchAgencyError(Exception): pass + + +class FailedValidationException(HTTPException): + def __init__(self, detail: str): + super().__init__(status_code=HTTPStatus.BAD_REQUEST, detail=detail) \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 4a12bb0e..28e4b4a3 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -10,6 +10,7 @@ from collector_db.enums import TaskType from collector_manager.DTOs.ExampleInputDTO import ExampleInputDTO from collector_manager.enums import CollectorType +from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse @@ -18,6 +19,7 @@ from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ URLAgencyAnnotationPostInfo +from core.DTOs.GetNextURLForAllAnnotationResponse import GetNextURLForAllAnnotationResponse from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewOuterResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse @@ -294,4 +296,37 @@ async def get_current_task_status(self) -> GetTaskStatusResponseInfo: data = self.get( url=f"/task/status" ) - return GetTaskStatusResponseInfo(**data) \ No newline at end of file + 
return GetTaskStatusResponseInfo(**data) + + async def get_next_url_for_all_annotations( + self, + batch_id: Optional[int] = None + ) -> GetNextURLForAllAnnotationResponse: + params = {} + update_if_not_none( + target=params, + source={"batch_id": batch_id} + ) + data = self.get( + url=f"/annotate/all", + params=params + ) + return GetNextURLForAllAnnotationResponse(**data) + + async def post_all_annotations_and_get_next( + self, + url_id: int, + all_annotations_post_info: AllAnnotationPostInfo, + batch_id: Optional[int] = None, + ) -> GetNextURLForAllAnnotationResponse: + params = {} + update_if_not_none( + target=params, + source={"batch_id": batch_id} + ) + data = self.post( + url=f"/annotate/all/{url_id}", + params=params, + json=all_annotations_post_info.model_dump(mode='json') + ) + return GetNextURLForAllAnnotationResponse(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index d5b6dade..a03540a1 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -1,16 +1,20 @@ +from http import HTTPStatus import pytest from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.URLMapping import URLMapping from collector_db.models import UserUrlAgencySuggestion, UserRelevantSuggestion, UserRecordTypeSuggestion +from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import RecordType, SuggestionType -from tests.helpers.complex_test_data_functions import AnnotateAgencySetupInfo, setup_for_annotate_agency +from core.exceptions import FailedValidationException +from tests.helpers.complex_test_data_functions import AnnotateAgencySetupInfo, setup_for_annotate_agency, \ + setup_for_get_next_url_for_final_review from html_tag_collector.DataClassTags import ResponseHTMLInfo from tests.helpers.DBDataCreator import BatchURLCreationInfo from tests.test_automated.integration.api.conftest import MOCK_USER_ID @@ -514,3 +518,135 @@ async def test_annotate_agency_submit_new(api_test_helper): assert len(all_manual_suggestions) == 1 assert all_manual_suggestions[0].is_new +@pytest.mark.asyncio +async def test_annotate_all(api_test_helper): + """ + Test the happy path workflow for the all-annotations endpoint + The user should be able to get a valid URL (filtering on batch id if needed), + submit a full annotation, and receive another URL + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_2 = setup_info_2.url_mapping + + # First, get a valid URL to annotate + get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() + + # Apply the second batch id as a filter and see that a 
different URL is returned + get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( + batch_id=setup_info_2.batch_id + ) + + assert get_response_1.next_annotation.url_id != get_response_2.next_annotation.url_id + + # Annotate the first and submit + agency_id = await ath.db_data_creator.agency() + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + is_relevant=True, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=False, + suggested_agency=agency_id + ) + ) + ) + assert post_response_1.next_annotation is not None + + # Confirm the second is received + assert post_response_1.next_annotation.url_id == url_mapping_2.url_id + + # Upon submitting the second, confirm that no more URLs are returned through either POST or GET + post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_2.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + is_relevant=False, + ) + ) + assert post_response_2.next_annotation is None + + get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() + assert get_response_3.next_annotation is None + + + # Check that all annotations are present in the database + + # Should be two relevance annotations, one True and one False + all_relevance_suggestions = await adb_client.get_all(UserRelevantSuggestion) + assert len(all_relevance_suggestions) == 2 + assert all_relevance_suggestions[0].relevant == True + assert all_relevance_suggestions[1].relevant == False + + # Should be one agency + all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_agency_suggestions) == 1 + assert all_agency_suggestions[0].is_new == False + assert all_agency_suggestions[0].agency_id == agency_id + + # Should be one record type + all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(all_record_type_suggestions) == 1 + assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value + +@pytest.mark.asyncio +async def test_annotate_all_post_batch_filtering(api_test_helper): + """ + Batch filtering should also work when posting annotations + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + setup_info_3 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_3 = setup_info_3.url_mapping + + # Submit the first annotation, using the third batch id, and receive the third URL + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + batch_id=setup_info_3.batch_id, + all_annotations_post_info=AllAnnotationPostInfo( + is_relevant=True, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=True + ) + ) + ) + + assert post_response_1.next_annotation.url_id == url_mapping_3.url_id + + +@pytest.mark.asyncio +async def test_annotate_all_validation_error(api_test_helper): + """ + Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response + """ + ath = 
api_test_helper + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + + with pytest.raises(FailedValidationException) as e: + response = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + is_relevant=False, + record_type=RecordType.ACCIDENT_REPORTS + ) + ) diff --git a/tests/test_automated/unit/dto/__init__.py b/tests/test_automated/unit/dto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_automated/unit/dto/test_all_annotation_post_info.py b/tests/test_automated/unit/dto/test_all_annotation_post_info.py new file mode 100644 index 00000000..3e5cbab4 --- /dev/null +++ b/tests/test_automated/unit/dto/test_all_annotation_post_info.py @@ -0,0 +1,37 @@ +import pytest +from pydantic import ValidationError + +from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo +from core.enums import RecordType +from core.exceptions import FailedValidationException + +# Mock values to pass +mock_record_type = RecordType.ARREST_RECORDS.value # replace with valid RecordType if Enum +mock_agency = {"is_new": False, "suggested_agency": 1} # replace with a valid dict for the URLAgencyAnnotationPostInfo model + +@pytest.mark.parametrize( + "is_relevant, record_type, agency, should_raise", + [ + (True, mock_record_type, mock_agency, False), # valid + (True, None, mock_agency, True), # missing record_type + (True, mock_record_type, None, True), # missing agency + (True, None, None, True), # missing both + (False, None, None, False), # valid + (False, mock_record_type, None, True), # record_type present + (False, None, mock_agency, True), # agency present + (False, mock_record_type, mock_agency, True), # both present + ] +) +def test_all_annotation_post_info_validation(is_relevant, record_type, agency, should_raise): + data = { + "is_relevant": is_relevant, + "record_type": record_type, + "agency": agency + } + + if should_raise: + with pytest.raises(FailedValidationException): + AllAnnotationPostInfo(**data) + else: + model = AllAnnotationPostInfo(**data) + assert model.is_relevant == is_relevant From 72ed9c9d0ae4a5f42e2f17c7536eff064287ba9b Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 21 Apr 2025 16:40:12 -0400 Subject: [PATCH 131/182] DRAFT --- local_database/docker/initdb.d/create-dbs.sql | 3 ++ local_database/docker/initdb.d/setup-fdw.sql | 15 ++++++++ local_database/setup_fdw.sh | 38 +++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 local_database/docker/initdb.d/create-dbs.sql create mode 100644 local_database/docker/initdb.d/setup-fdw.sql create mode 100644 local_database/setup_fdw.sh diff --git a/local_database/docker/initdb.d/create-dbs.sql b/local_database/docker/initdb.d/create-dbs.sql new file mode 100644 index 00000000..1c66dec9 --- /dev/null +++ b/local_database/docker/initdb.d/create-dbs.sql @@ -0,0 +1,3 @@ +-- Creates both logical DBs in one Postgres cluster +CREATE DATABASE data_sources_test_db; +CREATE DATABASE source_collector_test_db; diff --git a/local_database/docker/initdb.d/setup-fdw.sql b/local_database/docker/initdb.d/setup-fdw.sql new file mode 100644 index 00000000..1dd94b4c --- /dev/null +++ b/local_database/docker/initdb.d/setup-fdw.sql @@ -0,0 +1,15 @@ +-- This script connects to db_b and sets up FDW access to db_a +\connect source_collector_test_db; + +CREATE EXTENSION IF NOT EXISTS 
postgres_fdw; + +CREATE SERVER db_a_server + FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (host 'localhost', dbname 'db_a'); + +CREATE USER MAPPING FOR test_source_collector_user + SERVER db_a_server + OPTIONS (user 'test_source_collector_user', password 'HanviliciousHamiltonHilltops'); + +-- Example: import tables from db_a (assuming public schema exists and has tables) +IMPORT FOREIGN SCHEMA public FROM SERVER db_a_server INTO foreign_a; diff --git a/local_database/setup_fdw.sh b/local_database/setup_fdw.sh new file mode 100644 index 00000000..139dedc7 --- /dev/null +++ b/local_database/setup_fdw.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -euo pipefail + +# Defaults (can be overridden) +POSTGRES_HOST="${POSTGRES_HOST:-localhost}" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +DB_A="${DB_A:-db_a}" +DB_B="${DB_B:-db_b}" + +export PGPASSWORD="$POSTGRES_PASSWORD" + +echo "Creating databases $DB_A and $DB_B..." +psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" -d postgres -p "$POSTGRES_PORT" -c "CREATE DATABASE $DB_A;" || echo "$DB_A already exists" +psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" -d postgres -p "$POSTGRES_PORT" -c "CREATE DATABASE $DB_B;" || echo "$DB_B already exists" + +echo "Setting up FDW in $DB_B to access $DB_A..." +psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" -d "$DB_B" -p "$POSTGRES_PORT" < Date: Tue, 22 Apr 2025 09:15:03 -0400 Subject: [PATCH 132/182] Refactor Docker Logic and add Data Sources Dumper Logic --- .github/workflows/test_app.yml | 7 + ...3f1272f94b9_set_up_foreign_data_wrapper.py | 250 ++++++++++++++ local_database/DTOs.py | 52 +++ local_database/DataDumper/dump.sh | 25 +- local_database/DockerInfos.py | 83 +++++ local_database/classes/DockerClient.py | 81 +++++ local_database/classes/DockerContainer.py | 28 ++ local_database/classes/DockerManager.py | 73 ++++ local_database/classes/TimestampChecker.py | 32 ++ local_database/classes/__init__.py | 0 local_database/constants.py | 5 + local_database/create_database.py | 65 ++++ local_database/docker/initdb.d/create-dbs.sql | 3 - local_database/docker/initdb.d/setup-fdw.sql | 15 - local_database/dump_data_sources_schema.py | 18 + local_database/local_db_util.py | 18 + local_database/setup.py | 52 +++ local_database/setup_fdw.sh | 38 --- start_mirrored_local_app.py | 322 +----------------- 19 files changed, 797 insertions(+), 370 deletions(-) create mode 100644 alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py create mode 100644 local_database/DTOs.py create mode 100644 local_database/DockerInfos.py create mode 100644 local_database/classes/DockerClient.py create mode 100644 local_database/classes/DockerContainer.py create mode 100644 local_database/classes/DockerManager.py create mode 100644 local_database/classes/TimestampChecker.py create mode 100644 local_database/classes/__init__.py create mode 100644 local_database/constants.py create mode 100644 local_database/create_database.py delete mode 100644 local_database/docker/initdb.d/create-dbs.sql delete mode 100644 local_database/docker/initdb.d/setup-fdw.sql create mode 100644 local_database/dump_data_sources_schema.py create mode 100644 local_database/local_db_util.py create mode 100644 local_database/setup.py delete mode 100644 local_database/setup_fdw.sh diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index e16d1771..28a41e29 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -35,6 
+35,13 @@ jobs: --health-retries 5 steps: + - name: Set up FDW + run: | + ./local_database/setup_fdw.sh + env: + POSTGRES_HOST: postgres + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres - name: Checkout repository uses: actions/checkout@v4 - name: Install dependencies diff --git a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py new file mode 100644 index 00000000..5c1adf18 --- /dev/null +++ b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py @@ -0,0 +1,250 @@ +"""Set up foreign data wrapper + +Revision ID: 13f1272f94b9 +Revises: e285e6e7cf71 +Create Date: 2025-04-21 18:17:34.593973 + +""" +import os +from typing import Sequence, Union + +from alembic import op +from dotenv import load_dotenv + +# revision identifiers, used by Alembic. +revision: str = '13f1272f94b9' +down_revision: Union[str, None] = 'e285e6e7cf71' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + + load_dotenv() + remote_host = os.getenv("DATA_SOURCES_HOST") + user = os.getenv("DATA_SOURCES_USER") + password = os.getenv("DATA_SOURCES_PASSWORD") + db_name = os.getenv("DATA_SOURCES_DB") + port = os.getenv("DATA_SOURCES_PORT") + + op.execute(f"CREATE EXTENSION IF NOT EXISTS postgres_fdw;") + + op.execute(f""" + CREATE SERVER data_sources_server + FOREIGN DATA WRAPPER postgres_fdw + OPTIONS (host '{remote_host}', dbname '{db_name}', port '{port}'); + """) + + op.execute(f""" + CREATE USER MAPPING FOR {user} + SERVER data_sources_server + OPTIONS (user '{user}', password '{password}'); + """) + + op.execute('CREATE SCHEMA if not exists "remote";') + + # Users table + op.execute(""" + CREATE FOREIGN TABLE IF NOT EXISTS "remote".users + ( + id bigint, + created_at timestamp with time zone, + updated_at timestamp with time zone, + email text, + password_digest text, + api_key character varying, + role text + ) + SERVER data_sources_server + OPTIONS ( + schema_name 'public', + table_name 'users' + ); + """) + + # Agencies + # -Enums + # --Jurisdiction Type + op.execute(""" + CREATE TYPE jurisdiction_type AS ENUM + ('school', 'county', 'local', 'port', 'tribal', 'transit', 'state', 'federal'); + """) + # --Agency Type + op.execute(""" + CREATE TYPE agency_type AS ENUM + ('incarceration', 'law enforcement', 'aggregated', 'court', 'unknown'); + """) + + # -Table + op.execute(""" + CREATE FOREIGN TABLE IF NOT EXISTS "remote".agencies + ( + name character , + homepage_url character , + jurisdiction_type jurisdiction_type , + lat double precision, + lng double precision, + defunct_year character , + airtable_uid character , + agency_type agency_type , + multi_agency boolean , + no_web_presence boolean , + airtable_agency_last_modified timestamp with time zone, + rejection_reason character , + last_approval_editor character , + submitter_contact character, + agency_created timestamp with time zone, + id integer, + approval_status text, + creator_user_id integer + ) + SERVER data_sources_server + OPTIONS ( + schema_name 'public', + table_name 'agencies' + ); + """) + + # Locations Table + # -Enums + # --Location Type + op.execute(""" + CREATE TYPE location_type AS ENUM + ('State', 'County', 'Locality'); + """) + + # -Table + op.execute(""" + CREATE FOREIGN TABLE IF NOT EXISTS "remote".locations + ( + id bigint, + type location_type, + state_id bigint, + county_id bigint, + locality_id bigint + ) + SERVER 
data_sources_server + OPTIONS ( + schema_name 'public', + table_name 'locations' + ); + """) + + # Data Sources Table + + # -Enums + # -- access_type + op.execute(""" + CREATE TYPE access_type AS ENUM + ('Download', 'Webpage', 'API'); + """) + + # -- agency_aggregation + op.execute(""" + CREATE TYPE agency_aggregation AS ENUM + ('county', 'local', 'state', 'federal'); + """) + # -- update_method + op.execute(""" + CREATE TYPE update_method AS ENUM + ('Insert', 'No updates', 'Overwrite'); + """) + + # -- detail_level + op.execute(""" + CREATE TYPE detail_level AS ENUM + ('Individual record', 'Aggregated records', 'Summarized totals'); + """) + + # -- retention_schedule + op.execute(""" + CREATE TYPE retention_schedule AS ENUM + ('< 1 day', '1 day', '< 1 week', '1 week', '1 month', '< 1 year', '1-10 years', '> 10 years', 'Future only'); + """) + + # -Table + op.execute(""" + CREATE FOREIGN TABLE IF NOT EXISTS "remote".data_sources + ( + name character varying , + description character , + source_url character , + agency_supplied boolean, + supplying_entity character , + agency_originated boolean, + agency_aggregation agency_aggregation, + coverage_start date, + coverage_end date, + updated_at timestamp with time zone , + detail_level detail_level, + record_download_option_provided boolean, + data_portal_type character , + update_method update_method, + readme_url character , + originating_entity character , + retention_schedule retention_schedule, + airtable_uid character , + scraper_url character , + created_at timestamp with time zone , + submission_notes character , + rejection_note character , + submitter_contact_info character , + agency_described_not_in_database character , + data_portal_type_other character , + data_source_request character , + broken_source_url_as_of timestamp with time zone, + access_notes text , + url_status text , + approval_status text , + record_type_id integer, + access_types access_type[], + tags text[] , + record_formats text[] , + id integer, + approval_status_updated_at timestamp with time zone , + last_approval_editor bigint + ) + SERVER data_sources_server + OPTIONS ( + schema_name 'public', + table_name 'data_sources' + ); + """) + + + +def downgrade() -> None: + # Drop foreign schema + op.execute('DROP SCHEMA IF EXISTS "remote" CASCADE;') + + # Drop enums + enums = [ + "jurisdiction_type", + "agency_type", + "location_type", + "access_type", + "agency_aggregation", + "update_method", + "detail_level", + "retention_schedule", + ] + for enum in enums: + op.execute(f""" + DROP TYPE IF EXISTS {enum}; + """) + + # Drop user mapping + user = os.getenv("DATA_SOURCES_USER") + op.execute(f""" + DROP USER MAPPING FOR {user} SERVER data_sources_server; + """) + + # Drop server + op.execute(""" + DROP SERVER IF EXISTS data_sources_server CASCADE; + """) + + # Drop FDW + op.execute(""" + DROP EXTENSION IF EXISTS postgres_fdw CASCADE; + """) diff --git a/local_database/DTOs.py b/local_database/DTOs.py new file mode 100644 index 00000000..c4c5ff80 --- /dev/null +++ b/local_database/DTOs.py @@ -0,0 +1,52 @@ +from typing import Annotated, Optional + +from pydantic import BaseModel, AfterValidator + +from local_database.local_db_util import is_absolute_path, get_absolute_path + + +class VolumeInfo(BaseModel): + host_path: str + container_path: Annotated[str, AfterValidator(is_absolute_path)] + + def build_volumes(self): + return { + get_absolute_path(self.host_path): { + "bind": self.container_path, + "mode": "rw" + } + } + + +class DockerfileInfo(BaseModel): + 
image_tag: str + dockerfile_directory: Optional[str] = None + + +class HealthCheckInfo(BaseModel): + test: list[str] + interval: int + timeout: int + retries: int + start_period: int + + def build_healthcheck(self) -> dict: + multiplicative_factor = 1000000000 # Assume 1 second + return { + "test": self.test, + "interval": self.interval * multiplicative_factor, + "timeout": self.timeout * multiplicative_factor, + "retries": self.retries, + "start_period": self.start_period * multiplicative_factor + } + + +class DockerInfo(BaseModel): + dockerfile_info: DockerfileInfo + volume_info: Optional[VolumeInfo] = None + name: str + ports: Optional[dict] = None + environment: Optional[dict] + command: Optional[str] = None + entrypoint: Optional[list[str]] = None + health_check_info: Optional[HealthCheckInfo] = None diff --git a/local_database/DataDumper/dump.sh b/local_database/DataDumper/dump.sh index 9c07c0ca..482a3ca1 100644 --- a/local_database/DataDumper/dump.sh +++ b/local_database/DataDumper/dump.sh @@ -1,15 +1,28 @@ #!/bin/bash #set -e + # Variables (customize these or pass them as environment variables) DB_HOST=${DUMP_HOST:-"postgres_container"} DB_USER=${DUMP_USER:-"your_user"} -DB_PORT=${DUMP_PORT:-"5432"} # Default to 5432 if not provided +DB_PORT=${DUMP_PORT:-"5432"} DB_PASSWORD=${DUMP_PASSWORD:-"your_password"} DB_NAME=${DUMP_NAME:-"your_database"} -DUMP_FILE="/dump/db_dump.sql" +DUMP_FILE=${DUMP_FILE:-"/dump/db_dump.sql"} +DUMP_SCHEMA_ONLY=${DUMP_SCHEMA_ONLY:-false} # Set to "true" to dump only schema + # Export password for pg_dump export PGPASSWORD=$DB_PASSWORD -# Dump the database -echo "Dumping database $DB_NAME from $DB_HOST:$DB_PORT..." -pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME --no-owner --no-acl -F c -f $DUMP_FILE -echo "Dump completed. File saved to $DUMP_FILE." \ No newline at end of file + +# Determine pg_dump flags +PG_DUMP_FLAGS="--no-owner --no-acl -F c" +if [[ "$DUMP_SCHEMA_ONLY" == "true" ]]; then + PG_DUMP_FLAGS="$PG_DUMP_FLAGS --schema-only" + echo "Dumping schema only..." +else + echo "Dumping full database..." +fi + +# Run pg_dump +pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME $PG_DUMP_FLAGS -f $DUMP_FILE + +echo "Dump completed. File saved to $DUMP_FILE." 
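A note on the `HealthCheckInfo.build_healthcheck` conversion in `local_database/DTOs.py` above: the terse `# Assume 1 second` comment is there because the Docker Engine API measures healthcheck `interval`, `timeout`, and `start_period` in nanoseconds, so whole-second inputs have to be scaled by 10^9 before the dict is handed to docker-py. A minimal standalone sketch of that contract (the class name is illustrative, and the values mirror the postgres health check configured in `DockerInfos.py` rather than importing either module):

```
from pydantic import BaseModel

NANOSECONDS_PER_SECOND = 1_000_000_000  # Docker duration fields are nanoseconds


class HealthCheckSketch(BaseModel):
    test: list[str]
    interval: int      # seconds
    timeout: int       # seconds
    retries: int
    start_period: int  # seconds

    def build_healthcheck(self) -> dict:
        # Same shape docker-py expects for containers.run(healthcheck=...)
        return {
            "test": self.test,
            "interval": self.interval * NANOSECONDS_PER_SECOND,
            "timeout": self.timeout * NANOSECONDS_PER_SECOND,
            "retries": self.retries,
            "start_period": self.start_period * NANOSECONDS_PER_SECOND,
        }


# The 1-second interval used for the postgres container comes out as 10^9 ns.
sketch = HealthCheckSketch(
    test=["pg_isready", "-U", "test_source_collector_user"],
    interval=1, timeout=3, retries=30, start_period=2,
)
assert sketch.build_healthcheck()["interval"] == 1_000_000_000
```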
diff --git a/local_database/DockerInfos.py b/local_database/DockerInfos.py new file mode 100644 index 00000000..aecff2b7 --- /dev/null +++ b/local_database/DockerInfos.py @@ -0,0 +1,83 @@ +from local_database.DTOs import DockerInfo, DockerfileInfo, HealthCheckInfo, VolumeInfo +from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME +from util.helper_functions import get_from_env + + +def get_database_docker_info() -> DockerInfo: + return DockerInfo( + dockerfile_info=DockerfileInfo( + image_tag="postgres:15", + ), + name="data_source_identification_db", + ports={ + "5432/tcp": 5432 + }, + environment={ + "POSTGRES_PASSWORD": "HanviliciousHamiltonHilltops", + "POSTGRES_USER": "test_source_collector_user", + "POSTGRES_DB": "source_collector_test_db" + }, + health_check_info=HealthCheckInfo( + test=["pg_isready", "-U", "test_source_collector_user", "-h", "127.0.0.1", "-p", "5432"], + interval=1, + timeout=3, + retries=30, + start_period=2 + ) + ) + + +def get_data_sources_data_dumper_info() -> DockerInfo: + return DockerInfo( + dockerfile_info=DockerfileInfo( + image_tag="datadumper", + dockerfile_directory="DataDumper" + ), + volume_info=VolumeInfo( + host_path="./DataDumper/dump", + container_path="/dump" + ), + name="datadumper", + environment={ + "DUMP_HOST": get_from_env("PROD_DATA_SOURCES_HOST"), + "DUMP_USER": get_from_env("PROD_DATA_SOURCES_USER"), + "DUMP_PASSWORD": get_from_env("PROD_DATA_SOURCES_PASSWORD"), + "DUMP_NAME": get_from_env("PROD_DATA_SOURCES_DB"), + "DUMP_PORT": get_from_env("PROD_DATA_SOURCES_PORT"), + "RESTORE_HOST": get_from_env("POSTGRES_HOST"), + "RESTORE_USER": get_from_env("POSTGRES_USER"), + "RESTORE_PORT": get_from_env("POSTGRES_PORT"), + "RESTORE_DB_NAME": LOCAL_DATA_SOURCES_DB_NAME, + "RESTORE_PASSWORD": get_from_env("POSTGRES_PASSWORD"), + "DUMP_FILE": "/dump/data_sources_db_dump.sql", + "DUMP_SCHEMA_ONLY": "true" + }, + command="bash" + ) + + +def get_source_collector_data_dumper_info() -> DockerInfo: + return DockerInfo( + dockerfile_info=DockerfileInfo( + image_tag="datadumper", + dockerfile_directory="DataDumper" + ), + volume_info=VolumeInfo( + host_path="./DataDumper/dump", + container_path="/dump" + ), + name="datadumper", + environment={ + "DUMP_HOST": get_from_env("DUMP_HOST"), + "DUMP_USER": get_from_env("DUMP_USER"), + "DUMP_PASSWORD": get_from_env("DUMP_PASSWORD"), + "DUMP_NAME": get_from_env("DUMP_DB_NAME"), + "DUMP_PORT": get_from_env("DUMP_PORT"), + "RESTORE_HOST": "data_source_identification_db", + "RESTORE_USER": "test_source_collector_user", + "RESTORE_PORT": "5432", + "RESTORE_DB_NAME": "source_collector_test_db", + "RESTORE_PASSWORD": "HanviliciousHamiltonHilltops", + }, + command="bash" + ) diff --git a/local_database/classes/DockerClient.py b/local_database/classes/DockerClient.py new file mode 100644 index 00000000..bb452748 --- /dev/null +++ b/local_database/classes/DockerClient.py @@ -0,0 +1,81 @@ +import docker +from docker.errors import NotFound, APIError + +from local_database.DTOs import DockerfileInfo, DockerInfo +from local_database.local_db_util import get_absolute_path + + +class DockerClient: + + def __init__(self): + self.client = docker.from_env() + + def run_command(self, command: str, container_id: str): + exec_id = self.client.api.exec_create( + container_id, + cmd=command, + tty=True, + stdin=False + ) + output_stream = self.client.api.exec_start(exec_id=exec_id, stream=True) + for line in output_stream: + print(line.decode().rstrip()) + + def start_network(self, network_name): + try: + 
self.client.networks.create(network_name, driver="bridge") + except APIError as e: + # Assume already exists + print(e) + return self.client.networks.get(network_name) + + def stop_network(self, network_name): + self.client.networks.get(network_name).remove() + + def get_image(self, dockerfile_info: DockerfileInfo): + if dockerfile_info.dockerfile_directory: + # Build image from Dockerfile + self.client.images.build( + path=get_absolute_path(dockerfile_info.dockerfile_directory), + tag=dockerfile_info.image_tag + ) + else: + # Pull or use existing image + self.client.images.pull(dockerfile_info.image_tag) + + def run_container( + self, + docker_info: DockerInfo, + network_name: str + ): + print(f"Running container {docker_info.name}") + try: + container = self.client.containers.get(docker_info.name) + if container.status == 'running': + print(f"Container '{docker_info.name}' is already running") + return container + print("Restarting container...") + container.start() + return container + except NotFound: + # Container does not exist; proceed to build/pull image and run + pass + + self.get_image(docker_info.dockerfile_info) + + container = self.client.containers.run( + image=docker_info.dockerfile_info.image_tag, + volumes=docker_info.volume_info.build_volumes() if docker_info.volume_info is not None else None, + command=docker_info.command, + entrypoint=docker_info.entrypoint, + detach=True, + name=docker_info.name, + ports=docker_info.ports, + network=network_name, + environment=docker_info.environment, + stdout=True, + stderr=True, + tty=True, + healthcheck=docker_info.health_check_info.build_healthcheck() if docker_info.health_check_info is not None else None + ) + return container diff --git a/local_database/classes/DockerContainer.py b/local_database/classes/DockerContainer.py new file mode 100644 index 00000000..ee2ecba9 --- /dev/null +++ b/local_database/classes/DockerContainer.py @@ -0,0 +1,28 @@ +import time + +from docker.models.containers import Container + +from local_database.classes.DockerClient import DockerClient + + +class DockerContainer: + + def __init__(self, dc: DockerClient, container: Container): + self.dc = dc + self.container = container + + def run_command(self, command: str): + self.dc.run_command(command, self.container.id) + + def stop(self): + self.container.stop() + + def wait_for_pg_to_be_ready(self): + for i in range(30): + exit_code, output = self.container.exec_run("pg_isready") + print(output) + if exit_code == 0: + return + time.sleep(1) + raise Exception("Timed out waiting for postgres to be ready") + diff --git a/local_database/classes/DockerManager.py b/local_database/classes/DockerManager.py new file mode 100644 index 00000000..ab43f852 --- /dev/null +++ b/local_database/classes/DockerManager.py @@ -0,0 +1,73 @@ +import platform +import subprocess +import sys + +import docker +from docker.errors import APIError + +from local_database.DTOs import DockerfileInfo, DockerInfo +from local_database.classes.DockerClient import DockerClient +from local_database.classes.DockerContainer import DockerContainer + + +class DockerManager: + def __init__(self): + if not self.is_docker_running(): + self.start_docker_engine() + + self.client = DockerClient() + self.network_name = "my_network" + self.network = self.start_network() + + @staticmethod + def start_docker_engine(): + system = platform.system() + + match system: + case "Windows": + # Use PowerShell to start Docker Desktop on Windows + subprocess.run([ + "powershell", "-Command", + "Start-Process 'Docker 
Desktop' -Verb RunAs" + ]) + case "Darwin": + # MacOS: Docker Desktop must be started manually or with open + subprocess.run(["open", "-a", "Docker"]) + case "Linux": + # Most Linux systems use systemctl to manage Docker + subprocess.run(["sudo", "systemctl", "start", "docker"]) + case _: + print(f"Unsupported OS: {system}") + sys.exit(1) + + @staticmethod + def is_docker_running(): + try: + client = docker.from_env() + client.ping() + return True + except docker.errors.DockerException as e: + print(f"Docker is not running: {e}") + return False + + def run_command(self, command: str, container_id: str): + self.client.run_command(command, container_id) + + def start_network(self): + return self.client.start_network(self.network_name) + + def stop_network(self): + self.client.stop_network(self.network_name) + + def get_image(self, dockerfile_info: DockerfileInfo): + self.client.get_image(dockerfile_info) + + def run_container( + self, + docker_info: DockerInfo, + ) -> DockerContainer: + raw_container = self.client.run_container(docker_info, self.network_name) + return DockerContainer(self.client, raw_container) + + def get_containers(self): + return self.client.client.containers.list() \ No newline at end of file diff --git a/local_database/classes/TimestampChecker.py b/local_database/classes/TimestampChecker.py new file mode 100644 index 00000000..56779fd4 --- /dev/null +++ b/local_database/classes/TimestampChecker.py @@ -0,0 +1,32 @@ +import datetime +import os +from typing import Optional + + +class TimestampChecker: + def __init__(self): + self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time() + + def load_last_run_time(self) -> Optional[datetime.datetime]: + # Check if file `last_run.txt` exists + # If it does, load the last run time + if os.path.exists("local_state/last_run.txt"): + with open("local_state/last_run.txt", "r") as f: + return datetime.datetime.strptime( + f.read(), + "%Y-%m-%d %H:%M:%S" + ) + return None + + def last_run_within_24_hours(self): + if self.last_run_time is None: + return False + return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1) + + def set_last_run_time(self): + # If directory `local_state` doesn't exist, create it + if not os.path.exists("local_state"): + os.makedirs("local_state") + + with open("local_state/last_run.txt", "w") as f: + f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) diff --git a/local_database/classes/__init__.py b/local_database/classes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/local_database/constants.py b/local_database/constants.py new file mode 100644 index 00000000..d5c96e72 --- /dev/null +++ b/local_database/constants.py @@ -0,0 +1,5 @@ +LOCAL_DATA_SOURCES_DB_NAME = "test_data_sources_db" +LOCAL_SOURCE_COLLECTOR_DB_NAME = "source_collector_test_db" + +DUMP_SH_DOCKER_PATH = "/usr/local/bin/dump.sh" +RESTORE_SH_DOCKER_PATH = "/usr/local/bin/restore.sh" \ No newline at end of file diff --git a/local_database/create_database.py b/local_database/create_database.py new file mode 100644 index 00000000..b23cc6d2 --- /dev/null +++ b/local_database/create_database.py @@ -0,0 +1,65 @@ +import os +import psycopg2 +from psycopg2 import sql + +from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME, LOCAL_SOURCE_COLLECTOR_DB_NAME + +# Defaults (can be overridden via environment variables) +POSTGRES_HOST = os.getenv("POSTGRES_HOST", "host.docker.internal") +POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432")) +POSTGRES_USER = 
os.getenv("POSTGRES_USER", "test_source_collector_user") +POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "HanviliciousHamiltonHilltops") + + +# Connect to the default 'postgres' database to create other databases +def connect(database="postgres", autocommit=True): + conn = psycopg2.connect( + dbname=database, + user=POSTGRES_USER, + password=POSTGRES_PASSWORD, + host=POSTGRES_HOST, + port=POSTGRES_PORT + ) + if autocommit: + conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + return conn + +def create_database(db_name): + conn = connect("postgres") + with conn.cursor() as cur: + cur.execute(sql.SQL(""" + SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = %s AND pid <> pg_backend_pid() + """), [db_name]) + + # Drop the database if it exists + cur.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(db_name))) + print(f"🗑️ Dropped existing database: {db_name}") + + try: + cur.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))) + print(f"✅ Created database: {db_name}") + except psycopg2.errors.DuplicateDatabase: + print(f"⚠️ Database {db_name} already exists") + except Exception as e: + print(f"❌ Failed to create {db_name}: {e}") + +def create_database_tables(): + conn = connect(LOCAL_DATA_SOURCES_DB_NAME) + with conn.cursor() as cur: + cur.execute(""" + CREATE TABLE IF NOT EXISTS test_table ( + id SERIAL PRIMARY KEY, + name VARCHAR(255) NOT NULL + ) + """) + conn.commit() + +def main(): + print("Creating databases...") + create_database(LOCAL_DATA_SOURCES_DB_NAME) + create_database(LOCAL_SOURCE_COLLECTOR_DB_NAME) + +if __name__ == "__main__": + main() diff --git a/local_database/docker/initdb.d/create-dbs.sql b/local_database/docker/initdb.d/create-dbs.sql deleted file mode 100644 index 1c66dec9..00000000 --- a/local_database/docker/initdb.d/create-dbs.sql +++ /dev/null @@ -1,3 +0,0 @@ --- Creates both logical DBs in one Postgres cluster -CREATE DATABASE data_sources_test_db; -CREATE DATABASE source_collector_test_db; diff --git a/local_database/docker/initdb.d/setup-fdw.sql b/local_database/docker/initdb.d/setup-fdw.sql deleted file mode 100644 index 1dd94b4c..00000000 --- a/local_database/docker/initdb.d/setup-fdw.sql +++ /dev/null @@ -1,15 +0,0 @@ --- This script connects to db_b and sets up FDW access to db_a -\connect source_collector_test_db; - -CREATE EXTENSION IF NOT EXISTS postgres_fdw; - -CREATE SERVER db_a_server - FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (host 'localhost', dbname 'db_a'); - -CREATE USER MAPPING FOR test_source_collector_user - SERVER db_a_server - OPTIONS (user 'test_source_collector_user', password 'HanviliciousHamiltonHilltops'); - --- Example: import tables from db_a (assuming public schema exists and has tables) -IMPORT FOREIGN SCHEMA public FROM SERVER db_a_server INTO foreign_a; diff --git a/local_database/dump_data_sources_schema.py b/local_database/dump_data_sources_schema.py new file mode 100644 index 00000000..49627bc3 --- /dev/null +++ b/local_database/dump_data_sources_schema.py @@ -0,0 +1,18 @@ +from local_database.DockerInfos import get_data_sources_data_dumper_info +from local_database.classes.DockerManager import DockerManager +from local_database.constants import DUMP_SH_DOCKER_PATH + + +def main(): + docker_manager = DockerManager() + data_sources_docker_info = get_data_sources_data_dumper_info() + container = docker_manager.run_container(data_sources_docker_info) + try: + container.run_command(DUMP_SH_DOCKER_PATH) + finally: + container.stop() + + + +if 
__name__ == "__main__": + main() \ No newline at end of file diff --git a/local_database/local_db_util.py b/local_database/local_db_util.py new file mode 100644 index 00000000..7bc5bb12 --- /dev/null +++ b/local_database/local_db_util.py @@ -0,0 +1,18 @@ +from pathlib import Path + + +def get_absolute_path(relative_path: str) -> str: + """ + Get absolute path, using the current file as the point of reference + """ + current_dir = Path(__file__).parent + absolute_path = (current_dir / relative_path).resolve() + return str(absolute_path) + + +def is_absolute_path(path: str) -> str: + if len(path) == 0: + raise ValueError("Path is required") + if path[0] != "/": + raise ValueError("Container path must be absolute") + return path diff --git a/local_database/setup.py b/local_database/setup.py new file mode 100644 index 00000000..a720ebc2 --- /dev/null +++ b/local_database/setup.py @@ -0,0 +1,52 @@ +import subprocess +import time +import sys + +POSTGRES_SERVICE_NAME = "postgres" +FOLLOWUP_SCRIPT = "py create_database.py" +MAX_RETRIES = 20 +SLEEP_SECONDS = 1 + +def run_command(cmd, check=True, capture_output=False, **kwargs): + try: + return subprocess.run(cmd, shell=True, check=check, capture_output=capture_output, text=True, **kwargs) + except subprocess.CalledProcessError as e: + print(f"Command '{cmd}' failed: {e}") + sys.exit(1) + +def get_postgres_container_id(): + result = run_command(f"docker-compose ps -q {POSTGRES_SERVICE_NAME}", capture_output=True) + container_id = result.stdout.strip() + if not container_id: + print("Error: Could not find Postgres container.") + sys.exit(1) + return container_id + +def wait_for_postgres(container_id): + print("Waiting for Postgres to be ready...") + for i in range(MAX_RETRIES): + try: + run_command(f"docker exec {container_id} pg_isready -U postgres", check=True) + print("Postgres is ready!") + return + except subprocess.CalledProcessError: + print(f"Still waiting... ({i+1}/{MAX_RETRIES})") + time.sleep(SLEEP_SECONDS) + print("Postgres did not become ready in time.") + sys.exit(1) + +def main(): + print("Stopping Docker Compose...") + run_command("docker-compose down") + + print("Starting Docker Compose...") + run_command("docker-compose up -d") + + container_id = get_postgres_container_id() + wait_for_postgres(container_id) + + print("Running follow-up script...") + run_command(FOLLOWUP_SCRIPT) + +if __name__ == "__main__": + main() diff --git a/local_database/setup_fdw.sh b/local_database/setup_fdw.sh deleted file mode 100644 index 139dedc7..00000000 --- a/local_database/setup_fdw.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -# Defaults (can be overridden) -POSTGRES_HOST="${POSTGRES_HOST:-localhost}" -POSTGRES_PORT="${POSTGRES_PORT:-5432}" -POSTGRES_USER="${POSTGRES_USER:-postgres}" -POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" -DB_A="${DB_A:-db_a}" -DB_B="${DB_B:-db_b}" - -export PGPASSWORD="$POSTGRES_PASSWORD" - -echo "Creating databases $DB_A and $DB_B..." -psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" -d postgres -p "$POSTGRES_PORT" -c "CREATE DATABASE $DB_A;" || echo "$DB_A already exists" -psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" -d postgres -p "$POSTGRES_PORT" -c "CREATE DATABASE $DB_B;" || echo "$DB_B already exists" - -echo "Setting up FDW in $DB_B to access $DB_A..." 
-psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" -d "$DB_B" -p "$POSTGRES_PORT" < str: - if len(path) == 0: - raise ValueError("Path is required") - if path[0] != "/": - raise ValueError("Container path must be absolute") - return path - -class VolumeInfo(BaseModel): - host_path: str - container_path: Annotated[str, AfterValidator(is_absolute_path)] - - def build_volumes(self): - return { - get_absolute_path(self.host_path): { - "bind": self.container_path, - "mode": "rw" - } - } - -def wait_for_pg_to_be_ready(container: Container): - for i in range(30): - exit_code, output = container.exec_run("pg_isready") - print(output) - if exit_code == 0: - return - time.sleep(1) - raise Exception("Timed out waiting for postgres to be ready") - -def get_absolute_path(relative_path: str) -> str: - """ - Get absolute path, using the current file as the point of reference - """ - current_dir = Path(__file__).parent - absolute_path = (current_dir / relative_path).resolve() - return str(absolute_path) - - -class DockerfileInfo(BaseModel): - image_tag: str - dockerfile_directory: Optional[str] = None - - - -class HealthCheckInfo(BaseModel): - test: list[str] - interval: int - timeout: int - retries: int - start_period: int - - def build_healthcheck(self) -> dict: - multiplicative_factor = 1000000000 # Assume 1 second - return { - "test": self.test, - "interval": self.interval * multiplicative_factor, - "timeout": self.timeout * multiplicative_factor, - "retries": self.retries, - "start_period": self.start_period * multiplicative_factor - } - -class DockerInfo(BaseModel): - dockerfile_info: DockerfileInfo - volume_info: Optional[VolumeInfo] = None - name: str - ports: Optional[dict] = None - environment: Optional[dict] - command: Optional[str] = None - entrypoint: Optional[list[str]] = None - health_check_info: Optional[HealthCheckInfo] = None - -def run_command_checked(command: list[str] or str, shell=False): - result = subprocess.run( - command, - check=True, - capture_output=True, - text=True, - shell=shell - ) - return result - -def is_docker_running(): - try: - client = docker.from_env() - client.ping() - return True - except docker.errors.DockerException as e: - print(f"Docker is not running: {e}") - return False - -def wait_for_health(container, timeout=30): - start = time.time() - while time.time() - start < timeout: - container.reload() # Refresh container state - state = container.attrs.get("State") - print(state) - health = container.attrs.get("State", {}).get("Health", {}) - status = health.get("Status") - print(f"Health status: {status}") - if status == "healthy": - print("Postgres is healthy.") - return - elif status == "unhealthy": - raise Exception("Postgres container became unhealthy.") - time.sleep(1) - raise TimeoutError("Timed out waiting for Postgres to become healthy.") - -def start_docker_engine(): - system = platform.system() - - match system: - case "Windows": - # Use PowerShell to start Docker Desktop on Windows - subprocess.run([ - "powershell", "-Command", - "Start-Process 'Docker Desktop' -Verb RunAs" - ]) - case "Darwin": - # MacOS: Docker Desktop must be started manually or with open - subprocess.run(["open", "-a", "Docker"]) - case "Linux": - # Most Linux systems use systemctl to manage Docker - subprocess.run(["sudo", "systemctl", "start", "docker"]) - case _: - print(f"Unsupported OS: {system}") - sys.exit(1) - -class DockerManager: - def __init__(self): - self.client = docker.from_env() - self.network_name = "my_network" - self.network = self.start_network() +from 
local_database.DockerInfos import get_database_docker_info, get_source_collector_data_dumper_info +from local_database.classes.DockerManager import DockerManager +from local_database.classes.TimestampChecker import TimestampChecker +from local_database.constants import RESTORE_SH_DOCKER_PATH, DUMP_SH_DOCKER_PATH - def run_command(self, command: str, container_id: str): - exec_id = self.client.api.exec_create( - container_id, - cmd=command, - tty=True, - stdin=False - ) - output_stream = self.client.api.exec_start(exec_id=exec_id, stream=True) - for line in output_stream: - print(line.decode().rstrip()) - - def start_network(self): - try: - self.client.networks.create(self.network_name, driver="bridge") - except APIError as e: - # Assume already exists - print(e) - return self.client.networks.get("my_network") - - def stop_network(self): - self.client.networks.get("my_network").remove() - - def get_image(self, dockerfile_info: DockerfileInfo): - if dockerfile_info.dockerfile_directory: - # Build image from Dockerfile - self.client.images.build( - path=get_absolute_path(dockerfile_info.dockerfile_directory), - tag=dockerfile_info.image_tag - ) - else: - # Pull or use existing image - self.client.images.pull(dockerfile_info.image_tag) - - - def run_container( - self, - docker_info: DockerInfo, - ) -> Container: - print(f"Running container {docker_info.name}") - try: - container = self.client.containers.get(docker_info.name) - if container.status == 'running': - print(f"Container '{docker_info.name}' is already running") - return container - print("Restarting container...") - container.start() - return container - except NotFound: - # Container does not exist; proceed to build/pull image and run - pass - - self.get_image(docker_info.dockerfile_info) - - container = self.client.containers.run( - image=docker_info.dockerfile_info.image_tag, - volumes=docker_info.volume_info.build_volumes() if docker_info.volume_info is not None else None, - command=docker_info.command, - entrypoint=docker_info.entrypoint, - detach=True, - name=docker_info.name, - ports=docker_info.ports, - network=self.network_name, - environment=docker_info.environment, - stdout=True, - stderr=True, - tty=True, - healthcheck=docker_info.health_check_info.build_healthcheck() if docker_info.health_check_info is not None else None - ) - return container - - -class TimestampChecker: - def __init__(self): - self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time() - - def load_last_run_time(self) -> Optional[datetime.datetime]: - # Check if file `last_run.txt` exists - # If it does, load the last run time - if os.path.exists("local_state/last_run.txt"): - with open("local_state/last_run.txt", "r") as f: - return datetime.datetime.strptime( - f.read(), - "%Y-%m-%d %H:%M:%S" - ) - return None - - def last_run_within_24_hours(self): - if self.last_run_time is None: - return False - return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1) - - def set_last_run_time(self): - # If directory `local_state` doesn't exist, create it - if not os.path.exists("local_state"): - os.makedirs("local_state") - - with open("local_state/last_run.txt", "w") as f: - f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - -def get_database_docker_info() -> DockerInfo: - return DockerInfo( - dockerfile_info=DockerfileInfo( - image_tag="postgres:15", - ), - name="data_source_identification_db", - ports={ - "5432/tcp": 5432 - }, - environment={ - "POSTGRES_PASSWORD": "HanviliciousHamiltonHilltops", - 
"POSTGRES_USER": "test_source_collector_user", - "POSTGRES_DB": "source_collector_test_db" - }, - health_check_info=HealthCheckInfo( - test=["pg_isready", "-U", "test_source_collector_user", "-h", "127.0.0.1", "-p", "5432"], - interval=1, - timeout=3, - retries=30, - start_period=2 - ) - ) - -def get_data_dumper_docker_info() -> DockerInfo: - return DockerInfo( - dockerfile_info=DockerfileInfo( - image_tag="datadumper", - dockerfile_directory="local_database/DataDumper" - ), - volume_info=VolumeInfo( - host_path="./local_database/DataDumper/dump", - container_path="/dump" - ), - name="datadumper", - environment={ - "DUMP_HOST": get_from_env("DUMP_HOST"), - "DUMP_USER": get_from_env("DUMP_USER"), - "DUMP_PASSWORD": get_from_env("DUMP_PASSWORD"), - "DUMP_NAME": get_from_env("DUMP_DB_NAME"), - "DUMP_PORT": get_from_env("DUMP_PORT"), - "RESTORE_HOST": "data_source_identification_db", - "RESTORE_USER": "test_source_collector_user", - "RESTORE_PORT": "5432", - "RESTORE_DB_NAME": "source_collector_test_db", - "RESTORE_PASSWORD": "HanviliciousHamiltonHilltops", - }, - command="bash" - ) def main(): docker_manager = DockerManager() - # Ensure docker is running, and start if not - if not is_docker_running(): - start_docker_engine() # Ensure Dockerfile for database is running, and if not, start it database_docker_info = get_database_docker_info() - container = docker_manager.run_container(database_docker_info) - wait_for_pg_to_be_ready(container) + db_container = docker_manager.run_container(database_docker_info) + db_container.wait_for_pg_to_be_ready() # Start dockerfile for Datadumper - data_dumper_docker_info = get_data_dumper_docker_info() + data_dumper_docker_info = get_source_collector_data_dumper_info() # If not last run within 24 hours, run dump operation in Datadumper # Check cache if exists and checker = TimestampChecker() - container = docker_manager.run_container(data_dumper_docker_info) + data_dump_container = docker_manager.run_container(data_dumper_docker_info) if checker.last_run_within_24_hours(): print("Last run within 24 hours, skipping dump...") else: - docker_manager.run_command( - '/usr/local/bin/dump.sh', - container.id + data_dump_container.run_command( + DUMP_SH_DOCKER_PATH, ) - docker_manager.run_command( - "/usr/local/bin/restore.sh", - container.id + data_dump_container.run_command( + RESTORE_SH_DOCKER_PATH, ) print("Stopping datadumper container") - container.stop() + data_dump_container.stop() checker.set_last_run_time() # Upgrade using alembic @@ -351,7 +57,7 @@ def main(): finally: # Add feature to stop all running containers print("Stopping containers...") - for container in docker_manager.client.containers.list(): + for container in docker_manager.get_containers(): container.stop() print("Containers stopped.") From ca27fdbe6a2252b0615267687fb10508f9c9e3cf Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 22 Apr 2025 11:01:17 -0400 Subject: [PATCH 133/182] add .gitattributes --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..dfdb8b77 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.sh text eol=lf From 0e1eb394ba580c247c6fb422e92b90847503ce33 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 22 Apr 2025 13:28:05 -0400 Subject: [PATCH 134/182] DRAFT --- ENV.md | 12 +++- local_database/DTOs.py | 2 +- local_database/DockerInfos.py | 24 +++++-- local_database/classes/DockerClient.py | 83 ++++++++++++++++------ local_database/classes/DockerManager.py 
| 7 +- local_database/dump_data_sources_schema.py | 5 +- util/helper_functions.py | 10 +++ 7 files changed, 111 insertions(+), 32 deletions(-) diff --git a/ENV.md b/ENV.md index 5292320b..3452ef7c 100644 --- a/ENV.md +++ b/ENV.md @@ -21,4 +21,14 @@ Please ensure these are properly defined in a `.env` file in the root directory. |`PDAP_API_URL`| The URL for the PDAP API| `https://data-sources-v2.pdap.dev/api`| |`DISCORD_WEBHOOK_URL`| The URL for the Discord webhook used for notifications| `abc123` | -[^1:] The user account in question will require elevated permissions to access certain endpoints. At a minimum, the user will require the `source_collector` and `db_write` permissions. \ No newline at end of file +[^1:] The user account in question will require elevated permissions to access certain endpoints. At a minimum, the user will require the `source_collector` and `db_write` permissions. + +## Data Dumper + +``` +PROD_DATA_SOURCES_HOST=pdap-production-v2-do-user-8463429-0.k.db.ondigitalocean.com # The host of the production Data Sources Database +PROD_DATA_SOURCES_PORT=25060 # The port of the production Data Sources Database +PROD_DATA_SOURCES_USER=dump_user # The username for the production Data Sources Database +PROD_DATA_SOURCES_PASSWORD=GeriatricGeronimoGentrification # The password for the production Data Sources Database +PROD_DATA_SOURCES_DB=pdap_prod_v2_db # The database name for the production Data Sources Database +``` \ No newline at end of file diff --git a/local_database/DTOs.py b/local_database/DTOs.py index c4c5ff80..f222e5ba 100644 --- a/local_database/DTOs.py +++ b/local_database/DTOs.py @@ -11,7 +11,7 @@ class VolumeInfo(BaseModel): def build_volumes(self): return { - get_absolute_path(self.host_path): { + self.host_path: { "bind": self.container_path, "mode": "rw" } diff --git a/local_database/DockerInfos.py b/local_database/DockerInfos.py index aecff2b7..3b1c071b 100644 --- a/local_database/DockerInfos.py +++ b/local_database/DockerInfos.py @@ -1,6 +1,6 @@ from local_database.DTOs import DockerInfo, DockerfileInfo, HealthCheckInfo, VolumeInfo from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME -from util.helper_functions import get_from_env +from util.helper_functions import get_from_env, project_path def get_database_docker_info() -> DockerInfo: @@ -31,10 +31,17 @@ def get_data_sources_data_dumper_info() -> DockerInfo: return DockerInfo( dockerfile_info=DockerfileInfo( image_tag="datadumper", - dockerfile_directory="DataDumper" + dockerfile_directory=str(project_path( + "local_database", + "DataDumper" + )) ), volume_info=VolumeInfo( - host_path="./DataDumper/dump", + host_path=str(project_path( + "local_database", + "DataDumper", + "dump" + )), container_path="/dump" ), name="datadumper", @@ -60,10 +67,17 @@ def get_source_collector_data_dumper_info() -> DockerInfo: return DockerInfo( dockerfile_info=DockerfileInfo( image_tag="datadumper", - dockerfile_directory="DataDumper" + dockerfile_directory=str(project_path( + "local_database", + "DataDumper" + )) ), volume_info=VolumeInfo( - host_path="./DataDumper/dump", + host_path=str(project_path( + "local_database", + "DataDumper", + "dump" + )), container_path="/dump" ), name="datadumper", diff --git a/local_database/classes/DockerClient.py b/local_database/classes/DockerClient.py index bb452748..bfcc49df 100644 --- a/local_database/classes/DockerClient.py +++ b/local_database/classes/DockerClient.py @@ -2,7 +2,6 @@ from docker.errors import NotFound, APIError from local_database.DTOs import 
DockerfileInfo, DockerInfo -from local_database.local_db_util import get_absolute_path class DockerClient: @@ -26,42 +25,50 @@ def start_network(self, network_name): self.client.networks.create(network_name, driver="bridge") except APIError as e: # Assume already exists - print(e) + if e.response.status_code != 409: + raise e + print("Network already exists") return self.client.networks.get(network_name) def stop_network(self, network_name): self.client.networks.get(network_name).remove() - def get_image(self, dockerfile_info: DockerfileInfo): + def get_image( + self, + dockerfile_info: DockerfileInfo, + force_rebuild: bool = False + ): if dockerfile_info.dockerfile_directory: # Build image from Dockerfile self.client.images.build( - path=get_absolute_path(dockerfile_info.dockerfile_directory), - tag=dockerfile_info.image_tag + path=dockerfile_info.dockerfile_directory, + tag=dockerfile_info.image_tag, + nocache=force_rebuild, + rm=True # Remove intermediate images ) - else: - # Pull or use existing image + return + + if force_rebuild: + # Even if not from Dockerfile, re-pull to ensure freshness self.client.images.pull(dockerfile_info.image_tag) + return - def run_container( - self, - docker_info: DockerInfo, - network_name: str - ): - print(f"Running container {docker_info.name}") try: - container = self.client.containers.get(docker_info.name) - if container.status == 'running': - print(f"Container '{docker_info.name}' is already running") - return container - print("Restarting container...") - container.start() - return container + self.client.images.get(dockerfile_info.image_tag) + except NotFound: + self.client.images.pull(dockerfile_info.image_tag) + + def get_existing_container(self, docker_info_name: str): + try: + return self.client.containers.get(docker_info_name) except NotFound: - # Container does not exist; proceed to build/pull image and run - pass + return None - self.get_image(docker_info.dockerfile_info) + def create_container(self, docker_info: DockerInfo, network_name: str, force_rebuild: bool = False): + self.get_image( + docker_info.dockerfile_info, + force_rebuild=force_rebuild + ) container = self.client.containers.run( image=docker_info.dockerfile_info.image_tag, @@ -79,3 +86,33 @@ def run_container( healthcheck=docker_info.health_check_info.build_healthcheck() if docker_info.health_check_info is not None else None ) return container + + + def run_container( + self, + docker_info: DockerInfo, + network_name: str, + force_rebuild: bool = False + ): + print(f"Running container {docker_info.name}") + container = self.get_existing_container(docker_info.name) + if container is None: + return self.create_container( + docker_info=docker_info, + network_name=network_name, + force_rebuild=force_rebuild + ) + if force_rebuild: + print("Rebuilding container...") + container.remove(force=True) + return self.create_container( + docker_info=docker_info, + network_name=network_name, + force_rebuild=force_rebuild + ) + if container.status == 'running': + print(f"Container '{docker_info.name}' is already running") + return container + container.start() + return container + diff --git a/local_database/classes/DockerManager.py b/local_database/classes/DockerManager.py index ab43f852..ac294dc1 100644 --- a/local_database/classes/DockerManager.py +++ b/local_database/classes/DockerManager.py @@ -65,8 +65,13 @@ def get_image(self, dockerfile_info: DockerfileInfo): def run_container( self, docker_info: DockerInfo, + force_rebuild: bool = False ) -> DockerContainer: - raw_container = 
self.client.run_container(docker_info, self.network_name) + raw_container = self.client.run_container( + docker_info, + network_name=self.network_name, + force_rebuild=force_rebuild + ) return DockerContainer(self.client, raw_container) def get_containers(self): diff --git a/local_database/dump_data_sources_schema.py b/local_database/dump_data_sources_schema.py index 49627bc3..65079f53 100644 --- a/local_database/dump_data_sources_schema.py +++ b/local_database/dump_data_sources_schema.py @@ -6,7 +6,10 @@ def main(): docker_manager = DockerManager() data_sources_docker_info = get_data_sources_data_dumper_info() - container = docker_manager.run_container(data_sources_docker_info) + container = docker_manager.run_container( + data_sources_docker_info, + force_rebuild=True + ) try: container.run_command(DUMP_SH_DOCKER_PATH) finally: diff --git a/util/helper_functions.py b/util/helper_functions.py index 7d6c7f8d..deb6830b 100644 --- a/util/helper_functions.py +++ b/util/helper_functions.py @@ -1,10 +1,20 @@ import os from enum import Enum +from pathlib import Path from typing import Type from dotenv import load_dotenv from pydantic import BaseModel +def get_project_root(marker_files=(".project-root",)) -> Path: + current = Path(__file__).resolve() + for parent in [current] + list(current.parents): + if any((parent / marker).exists() for marker in marker_files): + return parent + raise FileNotFoundError("No project root found (missing marker files)") + +def project_path(*parts: str) -> Path: + return get_project_root().joinpath(*parts) def get_enum_values(enum: Type[Enum]): return [item.value for item in enum] From fbb329e6a6bd94cf5dea765badfc9e0a27e81ab1 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 15:04:52 -0400 Subject: [PATCH 135/182] DRAFT --- .github/workflows/test_app.yml | 21 ++++--- ENV.md | 8 +-- .../DataDumper/dump/data_sources_db_dump.sql | Bin 0 -> 183850 bytes local_database/classes/DockerClient.py | 2 +- local_database/create_database.py | 57 ++++++++++++++---- local_database/setup.py | 5 +- 6 files changed, 66 insertions(+), 27 deletions(-) create mode 100644 local_database/DataDumper/dump/data_sources_db_dump.sql diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index 28a41e29..1dfdd466 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -35,19 +35,19 @@ jobs: --health-retries 5 steps: - - name: Set up FDW - run: | - ./local_database/setup_fdw.sh - env: - POSTGRES_HOST: postgres - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - name: Checkout repository uses: actions/checkout@v4 + + - name: Install PostgreSQL client tools + run: | + apt-get update + apt-get install -y postgresql-client + - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt + python -m local_database.create_database --use-shell - name: Run tests run: | pytest tests/test_automated @@ -55,8 +55,13 @@ jobs: env: POSTGRES_PASSWORD: postgres POSTGRES_USER: postgres - POSTGRES_DB: postgres + POSTGRES_DB: source_collector_test_db POSTGRES_HOST: postgres POSTGRES_PORT: 5432 + DATA_SOURCES_HOST: postgres + DATA_SOURCES_PORT: 5432 + DATA_SOURCES_USER: postgres + DATA_SOURCES_PASSWORD: postgres + DATA_SOURCES_DB: test_data_sources_db GOOGLE_API_KEY: TEST GOOGLE_CSE_ID: TEST diff --git a/ENV.md b/ENV.md index 3452ef7c..f145e20e 100644 --- a/ENV.md +++ b/ENV.md @@ -26,9 +26,9 @@ Please ensure these are properly defined in a `.env` file in the root directory. 
## Data Dumper
 
 ```
-PROD_DATA_SOURCES_HOST=pdap-production-v2-do-user-8463429-0.k.db.ondigitalocean.com # The host of the production Data Sources Database
-PROD_DATA_SOURCES_PORT=25060 # The port of the production Data Sources Database
+PROD_DATA_SOURCES_HOST=127.0.0.1 # The host of the production Data Sources Database
+PROD_DATA_SOURCES_PORT=1234 # The port of the production Data Sources Database
 PROD_DATA_SOURCES_USER=dump_user # The username for the production Data Sources Database
-PROD_DATA_SOURCES_PASSWORD=GeriatricGeronimoGentrification # The password for the production Data Sources Database
-PROD_DATA_SOURCES_DB=pdap_prod_v2_db # The database name for the production Data Sources Database
+PROD_DATA_SOURCES_PASSWORD=password # The password for the production Data Sources Database
+PROD_DATA_SOURCES_DB=db_name # The database name for the production Data Sources Database
 ```
\ No newline at end of file
diff --git a/local_database/DataDumper/dump/data_sources_db_dump.sql b/local_database/DataDumper/dump/data_sources_db_dump.sql
new file mode 100644
index 0000000000000000000000000000000000000000..aa27b60a2bd866ef4228dfc2af49a72617b141a3
GIT binary patch
literal 183850
[base85-encoded payload omitted: 183,850-byte snapshot of data_sources_db_dump.sql]

literal 0
HcmV?d00001
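
The dump above is the seed data for the `--use-shell` restore path that this patch adds to `local_database/create_database.py` later in the series of diffs below: the script at `RESTORE_SH_DOCKER_PATH` is run with `RESTORE_HOST`, `RESTORE_PORT`, `RESTORE_USER`, `RESTORE_PASSWORD`, and `RESTORE_DB_NAME` in its environment. A minimal sketch of what such a restore amounts to, assuming a plain-SQL dump and `psql` on `PATH` (`restore_dump` is a hypothetical helper, not code from this patch):

```
# Sketch only: approximates what the restore script is expected to do.
# Assumes the dump is plain SQL (the file is data_sources_db_dump.sql)
# and that psql is installed; the RESTORE_* names mirror the environment
# passed to RESTORE_SH_DOCKER_PATH in local_database/create_database.py.
import os
import subprocess

def restore_dump(dump_path: str) -> None:
    env = os.environ.copy()
    # psql reads the password from PGPASSWORD rather than a flag
    env["PGPASSWORD"] = env.get("RESTORE_PASSWORD", "")
    subprocess.run(
        [
            "psql",
            "-h", env.get("RESTORE_HOST", "host.docker.internal"),
            "-p", env.get("RESTORE_PORT", "5432"),
            "-U", env.get("RESTORE_USER", "postgres"),
            "-d", env.get("RESTORE_DB_NAME", "data_sources_db"),
            "-f", dump_path,
        ],
        env=env,
        check=True,  # raise if the restore exits non-zero
    )
```
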

diff --git a/local_database/classes/DockerClient.py b/local_database/classes/DockerClient.py
index bfcc49df..ca9d535b
100644 --- a/local_database/classes/DockerClient.py +++ b/local_database/classes/DockerClient.py @@ -13,7 +13,7 @@ def run_command(self, command: str, container_id: str): exec_id = self.client.api.exec_create( container_id, cmd=command, - tty=True, + tty=False, stdin=False ) output_stream = self.client.api.exec_start(exec_id=exec_id, stream=True) diff --git a/local_database/create_database.py b/local_database/create_database.py index b23cc6d2..ea345fe0 100644 --- a/local_database/create_database.py +++ b/local_database/create_database.py @@ -1,8 +1,13 @@ +import argparse import os +import subprocess + import psycopg2 from psycopg2 import sql -from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME, LOCAL_SOURCE_COLLECTOR_DB_NAME +from local_database.DockerInfos import get_data_sources_data_dumper_info +from local_database.classes.DockerManager import DockerManager +from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME, LOCAL_SOURCE_COLLECTOR_DB_NAME, RESTORE_SH_DOCKER_PATH # Defaults (can be overridden via environment variables) POSTGRES_HOST = os.getenv("POSTGRES_HOST", "host.docker.internal") @@ -45,17 +50,6 @@ def create_database(db_name): except Exception as e: print(f"❌ Failed to create {db_name}: {e}") -def create_database_tables(): - conn = connect(LOCAL_DATA_SOURCES_DB_NAME) - with conn.cursor() as cur: - cur.execute(""" - CREATE TABLE IF NOT EXISTS test_table ( - id SERIAL PRIMARY KEY, - name VARCHAR(255) NOT NULL - ) - """) - conn.commit() - def main(): print("Creating databases...") create_database(LOCAL_DATA_SOURCES_DB_NAME) @@ -63,3 +57,42 @@ def main(): if __name__ == "__main__": main() + parser = argparse.ArgumentParser() + + parser.add_argument( + "--use-shell", + action="store_true", + help="Use shell to run restore script" + ) + + args = parser.parse_args() + + if args.use_shell: + subprocess.run( + [ + "bash", + "-c", + RESTORE_SH_DOCKER_PATH + ], + env={ + "RESTORE_HOST": POSTGRES_HOST, + "RESTORE_USER": POSTGRES_USER, + "RESTORE_PORT": POSTGRES_PORT, + "RESTORE_DB_NAME": LOCAL_DATA_SOURCES_DB_NAME, + "RESTORE_PASSWORD": POSTGRES_PASSWORD + } + ) + os.system(RESTORE_SH_DOCKER_PATH) + exit(0) + + docker_manager = DockerManager() + data_sources_docker_info = get_data_sources_data_dumper_info() + container = docker_manager.run_container( + data_sources_docker_info, + force_rebuild=True + ) + try: + container.run_command(RESTORE_SH_DOCKER_PATH) + finally: + container.stop() + diff --git a/local_database/setup.py b/local_database/setup.py index a720ebc2..99ff1da9 100644 --- a/local_database/setup.py +++ b/local_database/setup.py @@ -29,8 +29,9 @@ def wait_for_postgres(container_id): run_command(f"docker exec {container_id} pg_isready -U postgres", check=True) print("Postgres is ready!") return - except subprocess.CalledProcessError: - print(f"Still waiting... ({i+1}/{MAX_RETRIES})") + except subprocess.CalledProcessError as e: + print(f"Still waiting... 
({i + 1}/{MAX_RETRIES}) Exit code: {e.returncode}") + print(f"Output: {e.output if hasattr(e, 'output') else 'N/A'}") time.sleep(SLEEP_SECONDS) print("Postgres did not become ready in time.") sys.exit(1) From e3c00918c6ae571b64c144cde786399114f9bd0d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 15:08:39 -0400 Subject: [PATCH 136/182] DRAFT --- .github/workflows/test_app.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index 1dfdd466..8730d331 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -48,6 +48,17 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt python -m local_database.create_database --use-shell + env: + POSTGRES_PASSWORD: postgres + POSTGRES_USER: postgres + POSTGRES_DB: source_collector_test_db + POSTGRES_HOST: postgres + POSTGRES_PORT: 5432 + DATA_SOURCES_HOST: postgres + DATA_SOURCES_PORT: 5432 + DATA_SOURCES_USER: postgres + DATA_SOURCES_PASSWORD: postgres + DATA_SOURCES_DB: test_data_sources_db - name: Run tests run: | pytest tests/test_automated From 27ef0068d0f40089be6bd6b1e738c87ca53a1b6a Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 15:13:28 -0400 Subject: [PATCH 137/182] DRAFT --- .github/workflows/test_app.yml | 56 ++++++++++------------------------ 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index 8730d331..c3c54b83 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -1,21 +1,6 @@ -# This workflow will test the Source Collector App -# Utilizing the docker-compose file in the root directory name: Test Source Collector App -on: pull_request -#jobs: -# build: -# runs-on: ubuntu-latest -# steps: -# - name: Checkout repository -# uses: actions/checkout@v4 -# - name: Run docker-compose -# uses: hoverkraft-tech/compose-action@v2.0.1 -# with: -# compose-file: "docker-compose.yml" -# - name: Execute tests in the running service -# run: | -# docker ps -a && docker exec data-source-identification-app-1 pytest /app/tests/test_automated +on: pull_request jobs: container-job: @@ -34,6 +19,20 @@ jobs: --health-timeout 5s --health-retries 5 + env: # <-- Consolidated env block here + POSTGRES_PASSWORD: postgres + POSTGRES_USER: postgres + POSTGRES_DB: source_collector_test_db + POSTGRES_HOST: postgres + POSTGRES_PORT: 5432 + DATA_SOURCES_HOST: postgres + DATA_SOURCES_PORT: 5432 + DATA_SOURCES_USER: postgres + DATA_SOURCES_PASSWORD: postgres + DATA_SOURCES_DB: test_data_sources_db + GOOGLE_API_KEY: TEST + GOOGLE_CSE_ID: TEST + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -48,31 +47,8 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt python -m local_database.create_database --use-shell - env: - POSTGRES_PASSWORD: postgres - POSTGRES_USER: postgres - POSTGRES_DB: source_collector_test_db - POSTGRES_HOST: postgres - POSTGRES_PORT: 5432 - DATA_SOURCES_HOST: postgres - DATA_SOURCES_PORT: 5432 - DATA_SOURCES_USER: postgres - DATA_SOURCES_PASSWORD: postgres - DATA_SOURCES_DB: test_data_sources_db + - name: Run tests run: | pytest tests/test_automated pytest tests/test_alembic - env: - POSTGRES_PASSWORD: postgres - POSTGRES_USER: postgres - POSTGRES_DB: source_collector_test_db - POSTGRES_HOST: postgres - POSTGRES_PORT: 5432 - DATA_SOURCES_HOST: postgres - DATA_SOURCES_PORT: 5432 - DATA_SOURCES_USER: postgres - DATA_SOURCES_PASSWORD: postgres - DATA_SOURCES_DB: 
test_data_sources_db - GOOGLE_API_KEY: TEST - GOOGLE_CSE_ID: TEST From f4d41345806031dc0775fe2f64ea30620bd93f51 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 15:16:16 -0400 Subject: [PATCH 138/182] DRAFT --- local_database/create_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/local_database/create_database.py b/local_database/create_database.py index ea345fe0..58b15508 100644 --- a/local_database/create_database.py +++ b/local_database/create_database.py @@ -77,7 +77,7 @@ def main(): env={ "RESTORE_HOST": POSTGRES_HOST, "RESTORE_USER": POSTGRES_USER, - "RESTORE_PORT": POSTGRES_PORT, + "RESTORE_PORT": str(POSTGRES_PORT), "RESTORE_DB_NAME": LOCAL_DATA_SOURCES_DB_NAME, "RESTORE_PASSWORD": POSTGRES_PASSWORD } From 861ea7198147ee38973f8059ca433b4e8f7c2dbe Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 16:34:20 -0400 Subject: [PATCH 139/182] feat(database): begin setting up FDW - initial link --- ENV.md | 20 ++++++++++++++----- ...3f1272f94b9_set_up_foreign_data_wrapper.py | 10 +++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/ENV.md b/ENV.md index f145e20e..fdd7d029 100644 --- a/ENV.md +++ b/ENV.md @@ -23,12 +23,22 @@ Please ensure these are properly defined in a `.env` file in the root directory. [^1:] The user account in question will require elevated permissions to access certain endpoints. At a minimum, the user will require the `source_collector` and `db_write` permissions. +## Foreign Data Wrapper (FDW) +``` +FDW_DATA_SOURCES_HOST=127.0.0.1 # The host of the Data Sources Database, used for FDW setup +FDW_DATA_SOURCES_PORT=1234 # The port of the Data Sources Database, used for FDW setup +FDW_DATA_SOURCES_USER=fdw_user # The username for the Data Sources Database, used for FDW setup +FDW_DATA_SOURCES_PASSWORD=password # The password for the Data Sources Database, used for FDW setup +FDW_DATA_SOURCES_DB=db_name # The database name for the Data Sources Database, used for FDW setup + +``` + ## Data Dumper ``` -PROD_DATA_SOURCES_HOST=127.0.0.1 # The host of the production Data Sources Database -PROD_DATA_SOURCES_PORT=1234 # The port of the production Data Sources Database -PROD_DATA_SOURCES_USER=dump_user # The username for the production Data Sources Database -PROD_DATA_SOURCES_PASSWORD=password # The password for the production Data Sources Database -PROD_DATA_SOURCES_DB=db_name # The database name for the production Data Sources Database +PROD_DATA_SOURCES_HOST=127.0.0.1 # The host of the production Data Sources Database, used for Data Dumper +PROD_DATA_SOURCES_PORT=1234 # The port of the production Data Sources Database, used for Data Dumper +PROD_DATA_SOURCES_USER=dump_user # The username for the production Data Sources Database, used for Data Dumper +PROD_DATA_SOURCES_PASSWORD=password # The password for the production Data Sources Database, used for Data Dumper +PROD_DATA_SOURCES_DB=db_name # The database name for the production Data Sources Database, used for Data Dumper ``` \ No newline at end of file diff --git a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py index 5c1adf18..1b73f5f4 100644 --- a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py +++ b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py @@ -21,11 +21,11 @@ def upgrade() -> None: load_dotenv() - remote_host = os.getenv("DATA_SOURCES_HOST") - user = 
os.getenv("DATA_SOURCES_USER") - password = os.getenv("DATA_SOURCES_PASSWORD") - db_name = os.getenv("DATA_SOURCES_DB") - port = os.getenv("DATA_SOURCES_PORT") + remote_host = os.getenv("FDW_DATA_SOURCES_HOST") + user = os.getenv("FDW_DATA_SOURCES_USER") + password = os.getenv("FDW_DATA_SOURCES_PASSWORD") + db_name = os.getenv("FDW_DATA_SOURCES_DB") + port = os.getenv("FDW_DATA_SOURCES_PORT") op.execute(f"CREATE EXTENSION IF NOT EXISTS postgres_fdw;") From 8e47a33fdd9b9c20cbca934f21b59bf8874567e2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 16:36:17 -0400 Subject: [PATCH 140/182] feat(database): begin setting up FDW - initial link --- .github/workflows/test_app.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index c3c54b83..c869304a 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -30,6 +30,11 @@ jobs: DATA_SOURCES_USER: postgres DATA_SOURCES_PASSWORD: postgres DATA_SOURCES_DB: test_data_sources_db + FDW_DATA_SOURCES_HOST: postgres + FDW_DATA_SOURCES_PORT: 5432 + FDW_DATA_SOURCES_USER: postgres + FDW_DATA_SOURCES_PASSWORD: postgres + FDW_DATA_SOURCES_DB: test_data_sources_db GOOGLE_API_KEY: TEST GOOGLE_CSE_ID: TEST From f12ef61dafbc74c018d935f67dadc853d8e8afb2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 16:52:19 -0400 Subject: [PATCH 141/182] feat(database): begin setting up FDW - initial link --- .../2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py index 1b73f5f4..fc90db70 100644 --- a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py +++ b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py @@ -36,7 +36,7 @@ def upgrade() -> None: """) op.execute(f""" - CREATE USER MAPPING FOR {user} + CREATE USER MAPPING FOR PUBLIC SERVER data_sources_server OPTIONS (user '{user}', password '{password}'); """) From af7f0e05b5d7395888e4bf3061a76f238dded085 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 16:57:31 -0400 Subject: [PATCH 142/182] feat(database): begin setting up FDW - initial link --- .../2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py index fc90db70..737b49a0 100644 --- a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py +++ b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py @@ -236,7 +236,7 @@ def downgrade() -> None: # Drop user mapping user = os.getenv("DATA_SOURCES_USER") op.execute(f""" - DROP USER MAPPING FOR {user} SERVER data_sources_server; + DROP USER MAPPING FOR PUBLIC SERVER data_sources_server; """) # Drop server From b8d33223d880c2d92d4e343ef70b6830ad590978 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 17:38:10 -0400 Subject: [PATCH 143/182] fix(remove FDW setup): --- ...3f1272f94b9_set_up_foreign_data_wrapper.py | 250 ------------------ 1 file changed, 250 deletions(-) delete mode 100644 alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py diff --git 
a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py b/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py deleted file mode 100644 index 737b49a0..00000000 --- a/alembic/versions/2025_04_21_1817-13f1272f94b9_set_up_foreign_data_wrapper.py +++ /dev/null @@ -1,250 +0,0 @@ -"""Set up foreign data wrapper - -Revision ID: 13f1272f94b9 -Revises: e285e6e7cf71 -Create Date: 2025-04-21 18:17:34.593973 - -""" -import os -from typing import Sequence, Union - -from alembic import op -from dotenv import load_dotenv - -# revision identifiers, used by Alembic. -revision: str = '13f1272f94b9' -down_revision: Union[str, None] = 'e285e6e7cf71' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - - load_dotenv() - remote_host = os.getenv("FDW_DATA_SOURCES_HOST") - user = os.getenv("FDW_DATA_SOURCES_USER") - password = os.getenv("FDW_DATA_SOURCES_PASSWORD") - db_name = os.getenv("FDW_DATA_SOURCES_DB") - port = os.getenv("FDW_DATA_SOURCES_PORT") - - op.execute(f"CREATE EXTENSION IF NOT EXISTS postgres_fdw;") - - op.execute(f""" - CREATE SERVER data_sources_server - FOREIGN DATA WRAPPER postgres_fdw - OPTIONS (host '{remote_host}', dbname '{db_name}', port '{port}'); - """) - - op.execute(f""" - CREATE USER MAPPING FOR PUBLIC - SERVER data_sources_server - OPTIONS (user '{user}', password '{password}'); - """) - - op.execute('CREATE SCHEMA if not exists "remote";') - - # Users table - op.execute(""" - CREATE FOREIGN TABLE IF NOT EXISTS "remote".users - ( - id bigint, - created_at timestamp with time zone, - updated_at timestamp with time zone, - email text, - password_digest text, - api_key character varying, - role text - ) - SERVER data_sources_server - OPTIONS ( - schema_name 'public', - table_name 'users' - ); - """) - - # Agencies - # -Enums - # --Jurisdiction Type - op.execute(""" - CREATE TYPE jurisdiction_type AS ENUM - ('school', 'county', 'local', 'port', 'tribal', 'transit', 'state', 'federal'); - """) - # --Agency Type - op.execute(""" - CREATE TYPE agency_type AS ENUM - ('incarceration', 'law enforcement', 'aggregated', 'court', 'unknown'); - """) - - # -Table - op.execute(""" - CREATE FOREIGN TABLE IF NOT EXISTS "remote".agencies - ( - name character , - homepage_url character , - jurisdiction_type jurisdiction_type , - lat double precision, - lng double precision, - defunct_year character , - airtable_uid character , - agency_type agency_type , - multi_agency boolean , - no_web_presence boolean , - airtable_agency_last_modified timestamp with time zone, - rejection_reason character , - last_approval_editor character , - submitter_contact character, - agency_created timestamp with time zone, - id integer, - approval_status text, - creator_user_id integer - ) - SERVER data_sources_server - OPTIONS ( - schema_name 'public', - table_name 'agencies' - ); - """) - - # Locations Table - # -Enums - # --Location Type - op.execute(""" - CREATE TYPE location_type AS ENUM - ('State', 'County', 'Locality'); - """) - - # -Table - op.execute(""" - CREATE FOREIGN TABLE IF NOT EXISTS "remote".locations - ( - id bigint, - type location_type, - state_id bigint, - county_id bigint, - locality_id bigint - ) - SERVER data_sources_server - OPTIONS ( - schema_name 'public', - table_name 'locations' - ); - """) - - # Data Sources Table - - # -Enums - # -- access_type - op.execute(""" - CREATE TYPE access_type AS ENUM - ('Download', 'Webpage', 'API'); - """) - - # -- agency_aggregation - 
op.execute(""" - CREATE TYPE agency_aggregation AS ENUM - ('county', 'local', 'state', 'federal'); - """) - # -- update_method - op.execute(""" - CREATE TYPE update_method AS ENUM - ('Insert', 'No updates', 'Overwrite'); - """) - - # -- detail_level - op.execute(""" - CREATE TYPE detail_level AS ENUM - ('Individual record', 'Aggregated records', 'Summarized totals'); - """) - - # -- retention_schedule - op.execute(""" - CREATE TYPE retention_schedule AS ENUM - ('< 1 day', '1 day', '< 1 week', '1 week', '1 month', '< 1 year', '1-10 years', '> 10 years', 'Future only'); - """) - - # -Table - op.execute(""" - CREATE FOREIGN TABLE IF NOT EXISTS "remote".data_sources - ( - name character varying , - description character , - source_url character , - agency_supplied boolean, - supplying_entity character , - agency_originated boolean, - agency_aggregation agency_aggregation, - coverage_start date, - coverage_end date, - updated_at timestamp with time zone , - detail_level detail_level, - record_download_option_provided boolean, - data_portal_type character , - update_method update_method, - readme_url character , - originating_entity character , - retention_schedule retention_schedule, - airtable_uid character , - scraper_url character , - created_at timestamp with time zone , - submission_notes character , - rejection_note character , - submitter_contact_info character , - agency_described_not_in_database character , - data_portal_type_other character , - data_source_request character , - broken_source_url_as_of timestamp with time zone, - access_notes text , - url_status text , - approval_status text , - record_type_id integer, - access_types access_type[], - tags text[] , - record_formats text[] , - id integer, - approval_status_updated_at timestamp with time zone , - last_approval_editor bigint - ) - SERVER data_sources_server - OPTIONS ( - schema_name 'public', - table_name 'data_sources' - ); - """) - - - -def downgrade() -> None: - # Drop foreign schema - op.execute('DROP SCHEMA IF EXISTS "remote" CASCADE;') - - # Drop enums - enums = [ - "jurisdiction_type", - "agency_type", - "location_type", - "access_type", - "agency_aggregation", - "update_method", - "detail_level", - "retention_schedule", - ] - for enum in enums: - op.execute(f""" - DROP TYPE IF EXISTS {enum}; - """) - - # Drop user mapping - user = os.getenv("DATA_SOURCES_USER") - op.execute(f""" - DROP USER MAPPING FOR PUBLIC SERVER data_sources_server; - """) - - # Drop server - op.execute(""" - DROP SERVER IF EXISTS data_sources_server CASCADE; - """) - - # Drop FDW - op.execute(""" - DROP EXTENSION IF EXISTS postgres_fdw CASCADE; - """) From 8e013bb75f3e5fab40ab9b2a634d71e4cd055661 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Apr 2025 17:39:15 -0400 Subject: [PATCH 144/182] fix(database): Remove FDW setup and tests --- .github/workflows/test_app.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index c869304a..ab1edff9 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -25,16 +25,6 @@ jobs: POSTGRES_DB: source_collector_test_db POSTGRES_HOST: postgres POSTGRES_PORT: 5432 - DATA_SOURCES_HOST: postgres - DATA_SOURCES_PORT: 5432 - DATA_SOURCES_USER: postgres - DATA_SOURCES_PASSWORD: postgres - DATA_SOURCES_DB: test_data_sources_db - FDW_DATA_SOURCES_HOST: postgres - FDW_DATA_SOURCES_PORT: 5432 - FDW_DATA_SOURCES_USER: postgres - FDW_DATA_SOURCES_PASSWORD: postgres - FDW_DATA_SOURCES_DB: test_data_sources_db 
GOOGLE_API_KEY: TEST
       GOOGLE_CSE_ID: TEST
 
@@ -42,16 +32,10 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Install PostgreSQL client tools
-        run: |
-          apt-get update
-          apt-get install -y postgresql-client
-
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-          python -m local_database.create_database --use-shell
 
       - name: Run tests
         run: |

From fac193107618b6b76ecca24e991dbb4c3e7a2ac8 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Tue, 22 Apr 2025 17:42:08 -0400
Subject: [PATCH 145/182] fix(database): Remove FDW setup and tests

---
 .github/workflows/test_app.yml               |   2 +-
 .../DataDumper/dump/data_sources_db_dump.sql | Bin 183850 -> 0 bytes
 local_database/DockerInfos.py                |  38 ---------------
 local_database/constants.py                  |   1 -
 local_database/create_database.py            |  44 +----
 local_database/dump_data_sources_schema.py   |  21 ---------
 6 files changed, 2 insertions(+), 104 deletions(-)
 delete mode 100644 local_database/DataDumper/dump/data_sources_db_dump.sql
 delete mode 100644 local_database/dump_data_sources_schema.py

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index ab1edff9..73bc5738 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -19,7 +19,7 @@ jobs:
           --health-timeout 5s
           --health-retries 5
 
-    env: # <-- Consolidated env block here
+    env:
       POSTGRES_PASSWORD: postgres
       POSTGRES_USER: postgres
       POSTGRES_DB: source_collector_test_db
diff --git a/local_database/DataDumper/dump/data_sources_db_dump.sql b/local_database/DataDumper/dump/data_sources_db_dump.sql
deleted file mode 100644
index aa27b60a2bd866ef4228dfc2af49a72617b141a3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 183850
[base85-encoded payload omitted: reverse patch carrying the deleted 183,850-byte data_sources_db_dump.sql]
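
With the FDW migration deleted in PATCH 143, the `"remote"` schema only exists on databases that ran it before the removal. While it was in place, the wiring could be sanity-checked with a query against one of the foreign tables; a minimal check using `psycopg2` (already imported by `local_database/create_database.py`), with the workflow's local credentials as assumed connection details:

```
# Hypothetical smoke test for the since-removed postgres_fdw setup;
# only meaningful against a database that ran the deleted migration.
import psycopg2

conn = psycopg2.connect(
    host="127.0.0.1",
    port=5432,
    user="postgres",
    password="postgres",
    dbname="source_collector_test_db",
)
with conn, conn.cursor() as cur:
    # Foreign tables behave like ordinary relations once the server,
    # user mapping, and "remote" schema are in place.
    cur.execute('SELECT COUNT(*) FROM "remote".data_sources;')
    print("rows visible through FDW:", cur.fetchone()[0])
conn.close()
```
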
ztw@UrYEs|iR+MqwtxCS(U1Q`s5i&$Fxm9j6|g{?xlnW&1y=UaGDNHB)s@lC1`{#lt|ByOf8euQ!}z0A+~^ zY7p`yq4hhMabX)Pc)j2^wkA0|PKd~Kdk?E_oC;+#5kIa*ScNl}RtbcV$@2!utPmnc zT>lBaj)-fdf{?TtpF$Q{WX(sG+)7!VMr3srkXG7q@#fAP`e!bzYe8X6&@C;j1{MR8 z6(2(2izO-F99wyB_JE8CS5_nSln_-asZXJ)<7t@V`0fuDSApr2lIm`iesWAwYYR)& z$_qiQyrs9ndvvZZ=NAZ$z|MnJs9^JfQz(RdJm-MnJ_X1ewNDU_9{S`Sc0_<2!qFSU z-d&1A_gzI<-uk?I`t*)dw|9T;vi3Hy1BvvS??AF>^o15>@I&Lj^qI-^prF!T-Zjshlz$G8RP|(gNwm1 zJ-dReskqmUbP?Q~?cNAq@{Kh~zlY3xy!#|#{|p$Xb;B&Vc?&-+h|^0L0N$@KC^Es%}jCylEUsdgNcgZoz6val6KQRC6|~^jd{dx1c^6N zgqi)%e&-S3OHFR9Lsoh)pZzYz5?LcMotn8nm3_t%F=N`F2+~f`L)x~~f^Oa(=F|NS zd$g$HBb5e@ZjWS<`Q|;S zH-q9<3^8eJ-D5kd{}8W)?Vvv>1GN{-Z!iWLPEss(AD&Mlz=B+oY&S&g6Jmx^bsc$= zDAK_?Mc^OY`3Zay@)bD6vgzB2;>Y(31v1eW$*8$Qq39iMo1KiXf>oxj8SME9t`vDUd=sy|;!ynqRP+3WeG4YHp>A3cPJ& zkzg7Hs6L94G<@=Rb;J{zzhPMqjtNr3+>clX{IT)H_Sh&x1&(WD+FW+avpgTM*e3v(ND%CY2b(|mkvELGQ$bcmu~U6`W#lab}EQ@&tSOm&}9!X*Q@?Md=!}}?Sk1{sT7i(yPj=uZ=i>=)th!J zjdEV!>Zk{6+30oOAJZBArG4Mmg2R~{#CnReY4-;*`v$W0TYjf`2+M%VOxck(KcaBu z7UsMG{A*GliYc$xKWTQ`@8bQ2CLFFf4X<&#hn4cEf8N8@BvVGFExGXp*Mra)v4$Q; zg3H_7#Z9ORhRE=thXeCybt6ivoB$WI2H?pc0Ho^W3eE35Nxm;3=UgPRg%GZULD)rY zDx}?^^?}~m?n=UT5Qs-uWyLREqJ{Tr(X8Dw=BINYC^5RgrX&w>;osOh^lvYCyk}7Lx>9AfKNjL0Mt}aYcalsQWl&q*vyB0)HCjFKOwnS%*=IMfP z>Kvjzv4RL*5N@`mVVzi#or}Q{+aM%dJ$E(j?+F!x|9V=aw#IiMTK$Wf&Ahk}`xTMV zDwr=Sc<{b6ItCu}odw959BgMv|Hh!&pty!Wt9jWk-iRW(8p8xLrY!iIsr7_w_WvTf zCVp&z!GWmiLgmBp3Y+ukP=u*?No~Ft@52^^>}~HZ1mD zhf{i-m4L=T>&IYPkUs` z6NQ^WB&>yTHoWJh{_;_{R~iMS0%@BwmJpuS!}Oi00mS|SFO>E0n6+5^XHX%2{0V49 zS}YkF@g+~L#o|@)0kv4X-uSakxrL4=S#JtTmatg7RwuPsyov%AOZLsQ7K?Xl=YLW1 z;+VK(v8*ypGW+M9<`6kR>9CFKsQg(}5i(o6g3@-8HO2Pk2B*FLaEhQDVW%JvW!R<= z3pRb&&M9O51mb?f{!foMU+S`PSlY(IsQ#Z`E@<`LA?xWgUgWo zj`>KJ9N+F{_I_}DQ^-hu2l9q`6Q#q8ll(fpQSW^|uNW=qXQIcxlA`7YO#O0>$Sv`b zMfQZULU~J;7iBID%hPU_k;9&|N=0|%Wr65>E^WFVX@5ctb$W$y&M|8dy@6d#drw4` zY4tjzv+iVt0$t5n4w-=X0xWni)^g~-=J0KuJ)CBf4T*k4N#uAqW3@bZ^$_)N#=7L= zT23)6y6(P%pkg2oXRPTwl@RxE#%hZx17hM}Ub1p3HL&Qqx_%`tPK&~GAv=qEutuqC z0?@Ij920=cNHrj<35*^wnw6GQu5)MLX1zenB}XFI13P^tZH>xK`=UjPV?eN~^lr)q z&W(#mMvrUg*Sggr)DSi%v;+lCzGR(j^@4(P| zP)W=)VwU6;-$o*MNhgNPgV#;IrdxcTFbm=J71^GeeR!UFxc&k$5nb}4B!kf%ijTzc z%xQN?7hMP3uPRJ@a~7xR=^TzplFDo!b=-axVH0-T#_M_Tnz4#*TNBFRoTzgjcd*_&r9Q zp=T=n2)@YCk+PK3lHxC{&-2w_^!=#c2_0!zSX#?YH45#An|eB-w2#Zx+GVL9yY2IH z@_(v`5DF;w9lkd8zPx4Fm0k}0x+sJHdXO*G_{6{6D6Xg==;LmUuR&G$<5{x7uIa&# z=tXoaZ$t;XA z_bTUyVZDeSTg+-ToPXw^X0f*5I8CG>KXJ{UA4loX&~}Q+u^k<>c`F#l4C1atbc6J> zHt5iScF>aZF}&jwaCy8O^!NkZp~jcK48zhRJjsE6kz*5`R^wy)%Q7aoo2=nF9{i|& zOyO9rLq4MA#`BtM5Z(@sXWADEBaydyc@s*7M{>9>Zwx!1P3ys2bGm5h4Mo1W5kNLa?cc6-A=|d{yAQ+SFE9)J=chN zK-Z$$dwFq@vn5iPMH2#gq>~)6uDOA9Rr}i;JKJwT)+&m%7x$WcU*<;ezEJCPzeY1u zstNTX23?bbn$+*_81>tOqjt+V1kuSdk%})n^VZo43Y$8^wT`X!IUl%C9~mb$h(|Ij zp7bDaiWVC0P3B+X<+5W3j`~VpYl4hV2q#85nYlUql4X`cloU}FPmQOJ6%2HMRASVl z-a^n(Q%Bc?+==`3WUe{iJU{REka?gXT^^4om&79+PrZ6o(PV!_S^92oA3h1IB-z?H z+(7tN4a0>s*chCfOsk2YtrZ!(rT(qTWO=cF)pzSMn-%_L?NF`B2lzldsJi;((Wsc$ z7DX6XCX@>9L7tqy1P#e)rvzHMN>k7~DL6^usosLWLOLOj(M*}?xG6GIdprn%rWWuc zvu|c~Zk9=oiH;kir!ZNi*T_MMw3ujvggND@xv+_G@?pyL5CxXc9&IEJ?IJvK4&5iN zW+%LoOIPTBHgx<*JT-_dq}Btp6xXtz8O)ni4?eIgHB+2Q{VqgW|B=C~ADJldpm2Fc z4|!UU3RDeKQ%O_GSx1X%KfpM8VL~;^+D<1MRr)dKUsd2aqz^Oq^IzdF@xvd~me+Yn zzJkn`%L|Wd&n;~R`xg1QuDmsxSL-1iz@M& za*kvkO9~A1msgC#WP`^U0#655Rv(5|#o9rz$6Hi=Gd_rIy$B{zj9E0{KYwB#Fl~+y zcZR#yF^N#>i6hQOfjCu8z=9+sh)C5QArZ)m3@tQi>%2<*jbP%m%d$oH9O_$wAW_Z$ zxrAbRVAlA1R0lRtc9^Ov#*5EVC)KFLl#5P}y0-?YGjiLBZf4`kr!qGf*DYQm0F#PP z4j$(4>>@Bd(m%Z%>6vcuNq=WCgR|ms(kbkc^Aw1HFCBzwU?8iRHT&Nd#DMl-E#ZzN 
z{5vZqKa%KA9Y`cz=bl335M}tT+ud1{{ux2iY4^Y)T}496M4rA3yuBC%rw$f`Cy@gX z$92^!rgMNG0Bk$k`f%2CH-dE2LyK-XQTg52o&I?R~d zY*@yh^qXg#W=f{snl;I950Xs#Ko-d~-}v-?Q8B$dYZI~iG<}CCHL{Eud2V>xlG(;O z>AkcwP}l`W@)Y17gEH~sTENw{HQvRb;R*K)?ttjR(;{{9 zfX%tg{Y4>N(q1$3PlU^w&VUgnO=*A4vz`!tq;T#C&Buoey@%~1G8T^8&>U<5<9l*V zvH7OgFY2lX>ct=#@rS-k%OfpGo5GhxeS_l(bu%66wBKdW$`)BMrM`v6)))VWr7Uuv z0(RwTky2CtE=dnj;9s$_S#|eO{!B0x|8@C_G&FK$1HKx9=x#9FMcJ4MZkChQxP&OO zv6r<&&OA5S1Med|iuS^WuGyN8s34T6EWMl14C(^fWXfz_vr%6md?mikGk?v}V$RQU z6LKgVk7gt&#N}XosjyZU8B%;o3yxn{#5kBEB9e@^kV~1sh;qqJsVSxz z%x|MRYK-_;eW{I=RJYQ(5UYkk%R8cSiw8CD(9t?Vnu#>rkcYn)m^oG9?# zm{v?_Fw*d0DEMf7(^zO_44`pUbgbv5pWV?xE~_s%h0Pkh-vY7X$JSEh9a!TCZjz6I z^mPQ1*8WSB4(xPssRF;t_?uj2U!HA#7k9<_+X`4m{}Zy}e#3XQ&l`NXo~!t8qi9I+ z>jF~atH0S-A*SIDoF}*$0b8Rj`2d z;~>%WPs1%72 z84ejPFLv|45Jk;tWp8)qX|;R*ww=!8*Wt~2tZpTBzJ4vY9zlXsUXOV3`|}al5r2qNe=kJO5joU7Nh$O& zh;-K>&P3WMIu5v5u`r&mbDETRX2l)|gzIwCe_cYk?jB>QMWI|;TC7ms6AC^;bT2}W z5EP~|=fh>tMIcv?R>RVH9+K$qa)>r@E?e(7Oek4rm_u1$y%oPyvd#y?zI8DI*5$>8 z>+ZC>5@+A)O68KvKC&e(t~K}sKyZwFTUp-4DxH{dkqX0Hd&=pi99I_CA{9OemQe^L zDXql?ks}HJI$K|tV=Sw#V)~$?0%qR_4NQ5bqdN`U>@Z6CIC+p;U+=&W8)>2XCS_Eh zt}u_@Pt?UfGeVoD)2kA15HMvQN< zIwxgXq`ol6vSYLU{3SUSsqewCOI*sk2*-VzEP1ndBMHUJVUVhz^H4X7hEYBXZbyHQ12LKFq>sUJ`IaTM0$CbiG|ySfX`dA#Okr&hXXMW`bZt&5fE9kMC!0vfPTyZ{&fj^7B%ZAsks>Z)kUmb zX!_^w8k5XGTzM)@@WI&Ae^Gs7vm@TOFk2$i|G52{1$&ld=1Sa&z-GX*d$MmK#-3Vj zc_m{HlC;qRNT2rO)a0BZaWeMg)HgPJ;&qMBo_JjkX7ot*s0-cKUG`jxv8P(a8i~#n zXL||Zk@A_2f;Ojev+Ovq8|92mmoVCoJMGH}!l>ebAiE@2kV|q!>D|xs@;I`!D&Ob_ zS+J882Sxqe4t*xRehuTG?3E4!yyH3FfgkOxzgX_<$``R4)bD7-3O$z8wU~Y zI|K$07Mf@RaJgU@*VjAE+PH>WoQ2B9!F5*Ea7W31of-AxBao64c3DE2C4`#*Qls#A zkY)wPgfuJoK%`Wi;PDdpePRxjRyGisbOy=oMDh~Q=NsjDwxOIkje0C-byAx=+t91S zO6}$t<`LV_tHTCd_hS~4T$8%D=AOuXSq|*0tb1iQ$z;N@H~cpUc%JUHYOt`D)#aMu zW@$4umIHD`_nM_`d{x`CfOk>qUN1YzC5t8WlIy%r)y;dC-z`DXR z5OW`tVJY(LI3U-ZPDn*uru~_~EcLLLN+`*J*-Ee`vr|x(XpwxxR~>olc}b|M#e!N_ zSq5rZgPpp>0XwpuFV}~ez%AGFi>@hrB?340z!p};Sz@}54$&4kuMph{d&UtaQ=*+j z1op@+ShqW6!}H&U)3}smr>xiREMP^LDO(e3QYU*UAztUgyCM&1Ev#XsLC4NwTe6#P zY;5f8?A_jYVyE6XsBi3VKH1o>AKcnGJm3(>%_r-dZ%j7#b`K8sH@0^VIaa3fJ7Fv$ zjc_0k9iPvtNgl~*_>n}kIyX0Gi7rk*a!$C%la1Y{levxk{f(#JI(O+9GIJ!y#Q-A$ zyOjNAi^2+UQ!(A1F3Vj><}P)6>ca*X%lZ6mIFrqJS)Rg4_AP8Ozhx|f(BZ@@F%%_#iv&wBr{Ndz^-cPOm(K_gU8k&kr5)&B3SY3UOM! 
z%c@Uq?QXy67LKPQTo|~zQ$(m@4^6*TA3}7vQepd>b#^3>xa#^GH>1$4;rKa)z^cX* z^-JR%ht(boYE*M_bAS8B#{SdE8|#pJf!e2`A7VBU4)pO*qgrPMSUm>V!a6ny*U?e7 zoHSUbOHM#|(U3lW{!rRkbO11jX+MYbcuP^)j5hAbX=|cp!;%|qb7H2rTZ7kQOsTG} zRXLwS9I6mgq7wcE6TwoF8@}`#!UK(hV~-+ENDPWkpQtZoSTxmuk=wXcmq*(%9%10D zRhB@Eq|S|kQdGOo<9aA>%tRW=VNtN%ke2$49EL=ijd3H`=ETFrjZJxDoE5m+=jFtT zPI`8y-D=!Jv^AZ-b^ZqQt)RAgM;-)rok7IyPpBCiXxX0zVu)!ry-~h7A8Eg>Ndd08 z``zCi0~kwyb>7J{qRX;h;}Y%e`I{49Rz3u2_Lgh6;S`EwD+k4_xCdbNiff(}JTLI= zMhv{_B4R{HI(gwl)|~pKaoj#1aMm9L;v7Zi($hRbX)%U6gBZ$pelvR3qz>@eJfvz7 zsOHJ0>bH++w53Vmtw|?s%}oWg7l#?nS+e|eFV!z``OLGIRq0%0{kgB$@>3PZQJiKX z^8ovYF~Dj|*wQBic)BhTkXiD(INV>ABgiEy4{@Vp(WoN%tm(Vu@p7C#O*%%r;LY!k zvwCfzvg@)s7QAEiJ@QeU$zM_y%}C9K9FCNWvNz8oRfg1j-sbgYLdN+- zhjMe6goHvL3*K81A$fDrHijL#YmDpk9-qn?-!M1 zZ=~WP-bRW=ZI*k}btjjrd3~9frEVWt6j7}dzbLgr zp<$)S;5dYAPd7x?MlLuve2O&)yf)DO`8c`ZIH&u2o)F5^A2lp}Ne(Vo}#|Uc{tFZ8`X3?d!oHKft!1 zk@}37b39|_#CkI67>S;6`Uf$_;e^v7?*ZETv^v$ah^*vIGz#9(&*S5zpMz1Pkr3M= zp9aQ`m{9YUmyx=>(G&S@sxh?gdPcq=#tNKvT4m1&cf(TF{D(g6RPa8|nmik`pwcLd z;{A-WD$g@g(2X$53i>eVX!1@%ePfh$Ys=LNB}?G$XM-fbx5!I9vdDF57NfsoE|tGm zG}I!EMVT4-zNm9rajNp@G?HwZSu(^Recyx~4zpdg=9~sB>g}ZBuqxvtSKV0kZ;Fcr z&KA9XDzS*#)On7RJSiNT_c5D0Z%;R`1LN^>8k^i>)ZdCSY8^*uXm^P_4*SZgMg#>dDVQ~EDIfPb zo!))q7FL>H48`!fILqEuGz*0*jIt~Id6ZRoi-OJtjJPO_t z(owNZhL-7y6U^u8H?_eN*Zz{`CIZ7i)ti>FZX4P9%%U{Tbp{FMZsOk zq#ZYY351pC=bCM*k3lw#+rdc(1a?<=K*axb2Fh3F@Oq*0aZ*3Iye4rAZdAMJ)69@h zEh_n3j17_}$_Pjf_|puL6*sf^x5r>ZV)}&&WzJTq4QYZKB-URIz9S&IGHvD%YSk0M z(fS~3NDDkSO=^6vY%3n5ktQY;*)`Wu`Wk#-^}z&0wHc}^~iq+(cOBAVywx0)!x zbQ)3h;QZThIG1acg#tK{@U;zR1ay64*t?r{qf&dntUMLN8){Pvu=|o^***7XYMpU# zck3o%XC~I~iSeOIx7@#)`OrT@UPhi*jRY=c4j8h>qAbYU-!ke8Ge9>TYEGk{RU_{9 z1o&>AI2(LjZ9wGpyHJyp;_hXl`Rp9rYn4yR!#y_3<;$BUs}N|^A95;O%N5U#AWaR_Fax>%%qo zKtnGIEnSR!ML_H1NkWJ0Ct?K`lh%`b+U=?$Alo15_OxCgy+oQg&;Ike9YQ+N(8Sun zJ>>Mx9K4r#J??kYdlDDtJpY7OO*yP%#>W0QY%KxxG)9M)mue7)8CP~cl8H5Y#muOQ zfflad0^@DPRCAR%*z$YbiYLMpaG0YiNI*rWT@Q!kuU-Abg6lexMmdPCaBz{?zKtlc zHi+h@m!!22&6torS+aqcd5SjPBmoF&)^hL$jF!_7Y32>;QST2Z{Md9O67rgIPlky4+j|s+3t#O!I74yUH9?%~ot&#T&V?kvH+%ZN((jY-nOxb@ylgP7K>x zty<&7mz&_q`8z)T^A=xDB_Vj7PXby%2yEWs%c;Xd)_Fgb*m_C1-WP+mx`Z2UUWM_D ztIc2uP~tOO5T|~Hy8@ThkSB(31w&|G%)aX9sD!M(Ah!9fX{%Lr1sfQ2Ls>eYnJn*M zd)BOvb$*+ZN;JHTVw28Y`+j~D{zaT~{De6CF}U+K=`y;D;70AXmJm{$w@H^(o(n2! 
zo*YDH=@K137ozbVQOx&WDWgmmTy*(13>%8CTHV~dT`ARF?+=vA#bxCa~3 z=u`XI_9v;2ysaVCTny%5Ta$u~AQsjfURH~L6a{+;TSVkPwE`&B9zfulG06+-bMYB1 zJ%il%dQ=~C5LorZo6TXCRYTSj%`WRb4Vmm4;vL-qIc4W!thHLsTAh@+>oPH-m&f7< z;>=yERc^Y>Nt4`S%|ZdZ>M8(q|&lHjuh7n7f2JQ`Aabw<%MBce2UfXt_{XO zYeI&{uQ2<^afahwEuHT`9MT+M?l7*k}=(y?$uMn%(pVa$aPY zSuzKXJRW=9nwYDx>$83^%GG6DW=CrRlZ=?0nsO=o9QicJ->)ml-$V|8_LRb2L=%?8O&$~+ciSQ*9M4c1PV!ftY6=4RH*$zBXe9?aY+i|1)&I+B_D2AU;99CE<5n!O-5 z@*)OwRe89p+BM|3gmTOBKa>NqIYSWKPmnwqa zs`X$H)WX$ixPcs)tu{5({FKC6^+w8d9#iuOjHnu`@kx2HBCk{M0@2%1o-D2|a-e4> zDEha!>o9MlUl$J|tp-liU6>c>sjK#d*{XL^_&ibmoG5HHsO(p=Ccq`gSzHtp0^N#i zJcxssX9Bpjgs}{?Wc#kP>}DO3(n|}e*sSO_9zZ+L@}!_6Vpcc^-6-3;G4L1G7gs7T zk>Cr&2U)OdP+7RQ2AW}NL~bza1HZ^B)*3-a(SB+Hh=U+3w$HM-tow~Se;VmD83mEq68;A8|Z$H_u-`v^QtS3)x*Ka4; zRc7G|Hx3@ceKI@s&BLTWn8(DzV;jE`z+^wEzm54uJnN+MhJ#H%qwjEp=z=#T^iy(R zB^w9XtVIvR+h)JtynqMzWUk+u=h!X!&#>hAUZy<0bvC!Qt}Og_=yHjn5Dvx9!g(1qnp0s)Jz`8tO=^v1k!{<) z=w+_9ynI;+r6aPUMk!Rfy&NeP1-v%i`nDWlw!57n@9SY+yYYYW?j~7MH#EwBaj84c zWkvtOlgZy1|B_-_T2(j7ugNQA^}nvd|1C%1BY5fgs9~j6UBV5U6xXcRw#{SQwWCiH zcA;g4Ow-~1_Vw%aeWB1k7bT=Zlh2btxn$$oVSPW@-aV-AAJWi}{L*b*gEp*hY(AOn z@7+%7Z?13Nf`+_yYj^W-JNvrLeNa78yq=?_46TCMK+fw#R+DOXLjFBpcW z{DYB*HH-M~CH!8{G1VhRcysT@jXLgRqSBT3;s5dhAy-h33+(nma%r1!8zjx7o8H%o zJ?S0uAvRpuM48wx9!c(>rnvrqKjQ9OByAKr!fbX)0wn~S1V*$|DD5nrYu@$We~}}F ziNldLaMo!6?EE+A2BaOOt`h`xsusqV`I01Ro9}6Ok zk&v}Mu}sEJwYGyYNk>QWae}aAz4GPqiwSQbsJ>j5Av7r?n>*4-v7}SgJ}de34(k0` zTt0C#7ir0CNh!orI#wdG`8kADNH)!4F~Q858b>Zg&4iQ|%7~J*`jF{J-Ry({S;JzB zl;olp0{V`iK>AA3?U?S7FN_RuOc>pIq>j|pl@i1Q@^@KM$konD1ij4-U6s&WdF)>@epW+({WTdq?MJRAfld^ijgHlt%~oJ+0AU_0UGeus5u) zCLI@8Jn*=vN%ygriM<3; z8t+aMJ5oZZj@Uw`5umrKB!k8K!TgjXV#$r)|GnslsA5Nn&iOGBVqa?@hh+oOWhKXM z4Eya9tk?ZP@zsQ%Jt)M`j@A(~QmlSh!y+cuk%m~P>FQ8Rs3RXM%3Q&gfJ`XYNMR){ z7xx_C@5cC1TSC;?>l8njg;v11l3iei z-ik1^WF0Pl6~GfRcnP5-QgQJLmVOsMzn{b4r3y{#mY|3DD>s?Tu=fX1UwHUgxI}9# z$z7!$#u=97I>5Z8tAs*>wOkc0Yo{B!OGt_!IO}Ey|BWbTYcTI#1{ze*GgiGPBF-7! 
z2dVl8{@4uY&QL*RNhsgfezFB8zVQEhgHd;(6sxE?dD7vu**!@I%#Ib89l>5@HnM7S z;;|%+WADD@-_AjNnGsh`FGgJbF*f3E!Lblmg>6$VruoJ|Tve>dG2~pmnhK(Shu3X1kFlY6u62{}QD z&BEHI@?!Lef5gV2cC$m-tW;i$HyrAr#-2{ENU70IuwKWGmH#fzpYK71&O}n8p0QXJ z>C-e^J4CNNu0`5Wv>U@H^5wL=^wNj_Wpq^47MCgy6EB66QY2#p`}Cix*Zzp&HN9 zzZQi;qXk4*CC7$d^kRy+do6~%-LFiEDgSOD+O%+QhBcQ=`RC*Q=TY1jaWwCx=%|Pc zA*tox;&^{)7;mf8SXhger@)!=mw6Iy_b0=cpHC>)dWP^qOMWwkFHVuJ^3i-X0pQs4 zP@TIIz93xW5pd`JcRtL)k_zf6txxHeYPS2r2IcNvG}vEq);nq+w~;ou)$5GT#Fp5q zi+Q+n8SaYy38Al5)&7HfC;T5qpTbXR775@QGA_11Z(!#PJU-C2@Cgj}&Fn=f%l zhwZb}*Z6_iwx|+kZLJ-^PG3qXA;}%Dqr6`8wl@lUmJJcRGZPM!MMSmnT(IaS5%|+M zXOFSyJl*JV;*VmSsIFl* zkj`zJLo`vk`TV@!LwZW(0?*T2--n?YSyO^=aL}MfZkMD-s+c`%A`OK{O+n{o z$`m?W6JCY?ki(RP6&1d_4E;r6_{zpe)Gv)zKWz?seYR;4s@XWEv=J}Fr`O(?WY_G8 zJw-BXW5MvMLXKQTa$Z$?XgcP1Svu)a9>Yqct4-B&le3jaxzx{ zXN%9{+4tt~EUSh*zvh6Aaxt$9HYb@5&Iupjp(2vlNOqU>5pQlBY;J7T5#V?{QMbda z&g=8U>|Pb@Ucs>-9*mwl_=j`Yy^9g_G*18dMBtU=d7vLQ`Z6E7l>>!lLusG2nuv8#GT=T} zOXC7vO~FpNy85Ew)|?{%?x#*CYyl`e@-fa&$M0s0#RYwwl*I^u6s(Po^dF6~cyV=w zU3O{*M4SL1Wy`^IO8R2+?fY`TQB{VRrrz1D*i2F#2#rr2DJW;z?GpX)xDxKfq+F1D zKon#tr&%{+=`Y8)fLp!SD{ln}JnmJflG?5H_pOtX-(kk2b4a-s)6eH1SF)K9gG&}Q zduVgts(|KT-^?;UiEHW4pPoQZa?F--M6`R??B3bpBnDBj9OZB`Y6noLb8I3j zgKEBbl_GXh>5|EtD5sB5T5Im{q+iXEt~JgRUy8&sAC+3+{n(UXPb^%_2PyfsY{@qv z6-BS&s_DH`(lVLgn1oCqH*#_lYee|x)j9=mEFeE$axAc}|r-RJJ!j)DRK6(t3$CD{NHY^2?d1O?O} z{s0t!DCwb~L^MEvz(NQ?yaNKnpI|(G9FINU_*~yjs?}ZloEiK3W;{N&$CPR3=<<~U z$Ew_a_=$0GBl@^8wg>ksIV~$csUJ~QlMKikPtmx>fV>JGE?(_#S%WQAPpHk>mO*qR zni;gSE@wu6BL>Zv^pLX&?xEnLalq|~)U39D>7+F+y4mXMrVvP`=98Iv3X{U)29bOX z%p-R~-1L4;4Rty-AIlon$)&|KoXe-rT7gUD=dAxAmsYeDd85E1G>GGidd4KD8)&Hc zXJlssAuSex$E0qOMa`*D6Gu1`x+9^y0ML`V71>TyKR@e_OJ@3{zt+ldu$9WSRw+3kZrZ7#i1YA>43t*fZ zV!SDYwXXKc(X|29By-pFB1!S9k7tKjNMYh%BP3_Er-RcS9==2gSdM7>#=%AYczkki z^`B86G2Sy^tlp^Ea8*f&l^F66p@$0zHg$#bsZF%3(6m+TMUl1xGKzRM@)<^vjQO6@ z9xw9&{dgN38pU5Tx+3SahLw~cv7vL|Q8aL3 zIadCOAfP)fcKpTj`gsaY`lX$-NDFqRcl^NU7`c?4c4=#XfvV@TqNrM$wZ;gvsq78Z zE41za-N)H+Z82V)QSB!93xiI1tR5z4fQAtC2vRFD`smKWUaBE2I4ji###oGlA?s2Q z)MMp-Ay6+JVv0T3fj)cN2s)ij@L}?*oWXyK(4XzirW1D{m37R0rTE4`$E=)7z~y;S zY!33)XDvctG0-32M>NeON3v&um(OZoro7VMEZtk=QY9K%n|jF;#|xS!fhqfUkUWdU zim$P0kfTYSk>3ul_4jZOaSE*@X@)}0uP}6>3TDj9{q5P9F@wInVR~Rk$^_sey;Zpf z{wK(mJxT%V65ldglINAo_EZ6dqH4>_TyJ$Q5CWM3>Z?jguHXM->E$zxmbCAP4|l~3*#S4xE>PqI?Ysl;OJP)K@~8*7>@?niEjbX_1aJlX!&~)rpSFf`oo6(CO(RHhMHKgRc!>PY z$1TR_vZ2q>t5@nsn#33vc$rv?$E@SNR_Npy*@SMFvI>*;ki=wOWt9C9vx2@8J+cKW zxM_{3`oq4y+>{ZyDv}MMMDrG|2(#d*PxFmfkgMIzfZ=ed_X`yRZa@Ys_STaJ90OAB zBFeqRH{!VSNW9>hf+uA2)1~T{fMurh{Gd-iqMGG0Le3 zX_{>fQ1qMCm7_N?K(j@%ni8NuvZmF%Go}}8h)Pisq^wl4n^0^`-CDNfn_dOfkzYJY z^F>8ri0FWdO(=a+q9c3!kII&Qz);aKoBMz*#nX=FY~5weK`Z~R54jU;oP-<+QY_LC z5(h3-OL}~aZiXdA>05=R2^}&DMyI+6@)wBCdNJcoJ|f_zxp48jiuaJ;d*A4g;++~~ zMU_gd#4e)W9cEgaeF{}75R39k)h%p|L8LEie#TlYJO7H&q^zDU9ebf$g=QY~ASB;D zUnFyp62(xW8=w^)TGzOiJy80{qFn4G>p;#URfd7pMIB#3_LxQoUO7ErXTy{ z=nkK~a@-u&-m-B^F9hb7$}52qwM-oiIqF~crcf)4aTFc8gE7dNhi7=&u9+KFtsAlU zHIZiaY;?;qq0;9FvH6rH^e%VL@uABgcX%KtidgRnT{3^CXwp4FDr{r zuA71tP^=rLXj$an{0LXBHe5qEP1n1#$pykGB=KsQh9kHPGXf*(QGOd(w#0>4=)Tg$*P_-tSXmXA(a;vigjS+p^4+xG-g~XZ$|#Y zz{)4S%v&0<;%4tZb8t$yHK4LwVE$?YDn*(^Dk@?C6{^r#zYD20fn{ChoNI@`QdH}} z%8yv-28x(_vCQTECB|xR#<50FnH<8^7fzF9g`WkdkQa1`-O;EZ-)}&siDdWBzYkeVvy6@Ee5Ne70Upmek)oEDRT`YgfM?9=}NERp~Ox zmchBi?+CwAj)q`aC%b_@ik3JIdV|=1sDM8&KD~j~6|N&x?nRfbl=s$2`v4i{Wtlqo z45B-w6#oy6LYEuRaHQvsYc5nG8cH;G>WUJIAzgGxNvBTAt>fe@CL>21*IeZeMD>}$ Isu&ah2SePWi2wiq diff --git a/local_database/DockerInfos.py b/local_database/DockerInfos.py index 3b1c071b..17180bab 100644 --- a/local_database/DockerInfos.py 
+++ b/local_database/DockerInfos.py
@@ -1,5 +1,4 @@
 from local_database.DTOs import DockerInfo, DockerfileInfo, HealthCheckInfo, VolumeInfo
-from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME
 from util.helper_functions import get_from_env, project_path
 
 
@@ -26,43 +25,6 @@ def get_database_docker_info() -> DockerInfo:
         )
     )
 
-
-def get_data_sources_data_dumper_info() -> DockerInfo:
-    return DockerInfo(
-        dockerfile_info=DockerfileInfo(
-            image_tag="datadumper",
-            dockerfile_directory=str(project_path(
-                "local_database",
-                "DataDumper"
-            ))
-        ),
-        volume_info=VolumeInfo(
-            host_path=str(project_path(
-                "local_database",
-                "DataDumper",
-                "dump"
-            )),
-            container_path="/dump"
-        ),
-        name="datadumper",
-        environment={
-            "DUMP_HOST": get_from_env("PROD_DATA_SOURCES_HOST"),
-            "DUMP_USER": get_from_env("PROD_DATA_SOURCES_USER"),
-            "DUMP_PASSWORD": get_from_env("PROD_DATA_SOURCES_PASSWORD"),
-            "DUMP_NAME": get_from_env("PROD_DATA_SOURCES_DB"),
-            "DUMP_PORT": get_from_env("PROD_DATA_SOURCES_PORT"),
-            "RESTORE_HOST": get_from_env("POSTGRES_HOST"),
-            "RESTORE_USER": get_from_env("POSTGRES_USER"),
-            "RESTORE_PORT": get_from_env("POSTGRES_PORT"),
-            "RESTORE_DB_NAME": LOCAL_DATA_SOURCES_DB_NAME,
-            "RESTORE_PASSWORD": get_from_env("POSTGRES_PASSWORD"),
-            "DUMP_FILE": "/dump/data_sources_db_dump.sql",
-            "DUMP_SCHEMA_ONLY": "true"
-        },
-        command="bash"
-    )
-
-
 def get_source_collector_data_dumper_info() -> DockerInfo:
     return DockerInfo(
         dockerfile_info=DockerfileInfo(
diff --git a/local_database/constants.py b/local_database/constants.py
index d5c96e72..51147717 100644
--- a/local_database/constants.py
+++ b/local_database/constants.py
@@ -1,4 +1,3 @@
-LOCAL_DATA_SOURCES_DB_NAME = "test_data_sources_db"
 LOCAL_SOURCE_COLLECTOR_DB_NAME = "source_collector_test_db"
 
 DUMP_SH_DOCKER_PATH = "/usr/local/bin/dump.sh"
diff --git a/local_database/create_database.py b/local_database/create_database.py
index 58b15508..67eae70b 100644
--- a/local_database/create_database.py
+++ b/local_database/create_database.py
@@ -5,9 +5,7 @@
 import psycopg2
 from psycopg2 import sql
 
-from local_database.DockerInfos import get_data_sources_data_dumper_info
-from local_database.classes.DockerManager import DockerManager
-from local_database.constants import LOCAL_DATA_SOURCES_DB_NAME, LOCAL_SOURCE_COLLECTOR_DB_NAME, RESTORE_SH_DOCKER_PATH
+from local_database.constants import LOCAL_SOURCE_COLLECTOR_DB_NAME, RESTORE_SH_DOCKER_PATH
 
 # Defaults (can be overridden via environment variables)
 POSTGRES_HOST = os.getenv("POSTGRES_HOST", "host.docker.internal")
@@ -52,47 +50,7 @@ def create_database(db_name):
 
 def main():
     print("Creating databases...")
-    create_database(LOCAL_DATA_SOURCES_DB_NAME)
     create_database(LOCAL_SOURCE_COLLECTOR_DB_NAME)
 
 if __name__ == "__main__":
     main()
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--use-shell",
-        action="store_true",
-        help="Use shell to run restore script"
-    )
-
-    args = parser.parse_args()
-
-    if args.use_shell:
-        subprocess.run(
-            [
-                "bash",
-                "-c",
-                RESTORE_SH_DOCKER_PATH
-            ],
-            env={
-                "RESTORE_HOST": POSTGRES_HOST,
-                "RESTORE_USER": POSTGRES_USER,
-                "RESTORE_PORT": str(POSTGRES_PORT),
-                "RESTORE_DB_NAME": LOCAL_DATA_SOURCES_DB_NAME,
-                "RESTORE_PASSWORD": POSTGRES_PASSWORD
-            }
-        )
-        os.system(RESTORE_SH_DOCKER_PATH)
-        exit(0)
-
-    docker_manager = DockerManager()
-    data_sources_docker_info = get_data_sources_data_dumper_info()
-    container = docker_manager.run_container(
-        data_sources_docker_info,
-        force_rebuild=True
-    )
-    try:
-        container.run_command(RESTORE_SH_DOCKER_PATH)
-    finally:
-        container.stop()
-
diff --git a/local_database/dump_data_sources_schema.py b/local_database/dump_data_sources_schema.py
deleted file mode 100644
index 65079f53..00000000
--- a/local_database/dump_data_sources_schema.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from local_database.DockerInfos import get_data_sources_data_dumper_info
-from local_database.classes.DockerManager import DockerManager
-from local_database.constants import DUMP_SH_DOCKER_PATH
-
-
-def main():
-    docker_manager = DockerManager()
-    data_sources_docker_info = get_data_sources_data_dumper_info()
-    container = docker_manager.run_container(
-        data_sources_docker_info,
-        force_rebuild=True
-    )
-    try:
-        container.run_command(DUMP_SH_DOCKER_PATH)
-    finally:
-        container.stop()
-
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
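The body of the retained `create_database` helper is not shown in these hunks. A minimal sketch of what an idempotent version might look like, assuming psycopg2 with autocommit and an existence check before creation; the actual implementation in the repository may differ:

    import os

    import psycopg2
    from psycopg2 import sql

    POSTGRES_HOST = os.getenv("POSTGRES_HOST", "host.docker.internal")
    POSTGRES_USER = os.getenv("POSTGRES_USER", "postgres")
    POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "postgres")
    POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))


    def create_database(db_name: str) -> None:
        # Connect to the default maintenance database; CREATE DATABASE
        # cannot run inside a transaction, so autocommit is required.
        conn = psycopg2.connect(
            host=POSTGRES_HOST,
            user=POSTGRES_USER,
            password=POSTGRES_PASSWORD,
            port=POSTGRES_PORT,
            dbname="postgres",
        )
        conn.autocommit = True
        try:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT 1 FROM pg_database WHERE datname = %s", (db_name,)
                )
                if cur.fetchone() is None:
                    # sql.Identifier quotes the database name safely
                    cur.execute(
                        sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
                    )
        finally:
            conn.close()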
From 4799d7eb8763f4776cd6ffb378c32bcb3d5cfdbf Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Tue, 22 Apr 2025 17:42:45 -0400
Subject: [PATCH 146/182] fix(database): Remove FDW setup and tests

---
 .github/workflows/test_app.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index 73bc5738..ae0bb121 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -22,7 +22,7 @@ jobs:
         env:
           POSTGRES_PASSWORD: postgres
           POSTGRES_USER: postgres
-          POSTGRES_DB: source_collector_test_db
+          POSTGRES_DB: postgres
           POSTGRES_HOST: postgres
           POSTGRES_PORT: 5432
           GOOGLE_API_KEY: TEST

From 79ccfba9fd1c2a87d172defc55cd9b0366e006aa Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:07:31 -0400
Subject: [PATCH 147/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index ae0bb121..a14541f0 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -35,7 +35,8 @@ jobs:
       - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          pip install uv
+          uv pip install -r requirements.txt

      - name: Run tests
        run: |

From 2f7abc220b267ec30923a724406f5f1caf94ddf6 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:11:55 -0400
Subject: [PATCH 148/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index a14541f0..c7283ec3 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -36,7 +36,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install uv
-          uv pip install -r requirements.txt
+          uv --system pip install -r requirements.txt

      - name: Run tests

From e8575eb6562b2212041dbfde986fbe01bac5aa29 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:13:18 -0400
Subject: [PATCH 149/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index c7283ec3..63e33382 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -35,8 +35,8 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install uv
-          uv --system pip install -r requirements.txt
+          pip install uv --system
+          uv system pip install -r requirements.txt

      - name: Run tests
        run: |

From d8671056a341cdba0a208b8ad704b734e6d8ac52 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:15:45 -0400
Subject: [PATCH 150/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index 63e33382..a14541f0 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -35,8 +35,8 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install uv --system
-          uv system pip install -r requirements.txt
+          pip install uv
+          uv pip install -r requirements.txt

      - name: Run tests
        run: |

From 88bad7c035fa74720f91151410f888492539df88 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:21:31 -0400
Subject: [PATCH 151/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index a14541f0..e34ed390 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -36,8 +36,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install uv
-          uv pip install -r requirements.txt
-
+          uv pip install --system -r requirements.txt
      - name: Run tests
        run: |
          pytest tests/test_automated

From a6c79aa77bcfa8190866d6edcded19caeeff8f97 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:28:08 -0400
Subject: [PATCH 152/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index e34ed390..ea3ef6fc 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -32,11 +32,16 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

+      - name: Install uv
+      - name: Install uv and set the python version
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip
-          pip install uv
          uv pip install --system -r requirements.txt
+
      - name: Run tests
        run: |
          pytest tests/test_automated

From babcde8ffeb9bbc955213d58a71c973f53357b67 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:07:31 -0400
Subject: [PATCH 153/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index ae0bb121..ea3ef6fc 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -32,10 +32,15 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

+      - name: Install uv
+      - name: Install uv and set the python version
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          uv pip install --system -r requirements.txt

      - name: Run tests

From 7a8b3736f770a5d9c39e0b4c3c30846b70cd23a3 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:31:06 -0400
Subject: [PATCH 154/182] Update test_app.yml to use uv

---
 .github/workflows/test_app.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml
index ea3ef6fc..5b4da872 100644
--- a/.github/workflows/test_app.yml
+++ b/.github/workflows/test_app.yml
@@ -32,7 +32,6 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

-      - name: Install uv
      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v5
        with:

From ac85798660aa7e74e704738e9ea7e9c0635c9e18 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 18:36:13 -0400
Subject: [PATCH 155/182] Update Dockerfile to use uv

---
 Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 6718a121..5352bc99 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,7 @@
 # Dockerfile for Source Collector FastAPI app
 
 FROM python:3.11.9-slim
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
 # Set working directory
 WORKDIR /app
@@ -8,7 +9,7 @@ WORKDIR /app
 COPY requirements.txt ./requirements.txt
 
 # Install dependencies
-RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
+RUN uv pip install --system -r requirements.txt
 
 RUN playwright install chromium
 RUN playwright install-deps chromium

From 220c3196705aac9d9e47939b0391d86a98c7b4fa Mon Sep 17 00:00:00 2001
From: maxachis
Date: Fri, 2 May 2025 21:22:27 -0400
Subject: [PATCH 156/182] DRAFT

---
 api/routes/collector.py                        | 15 ++++++
 collector_db/AsyncDatabaseClient.py            | 53 +++++++++++++++++++
 collector_db/DatabaseClient.py                 | 13 ++---
 collector_manager/enums.py                     |  1 +
 core/AsyncCore.py                              | 12 ++++-
 core/DTOs/ManualBatchInputDTO.py               | 24 +++++++++
 core/DTOs/ManualBatchOutputDTO.py              |  6 +++
 pyproject.toml                                 |  3 ++
 .../integration/api/test_manual_batch.py       | 18 +++++++
 9 files changed, 138 insertions(+), 7 deletions(-)
 create mode 100644 core/DTOs/ManualBatchInputDTO.py
 create mode 100644 core/DTOs/ManualBatchOutputDTO.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/test_automated/integration/api/test_manual_batch.py

diff --git a/api/routes/collector.py b/api/routes/collector.py
index e2789443..b7628c4f 100644
--- a/api/routes/collector.py
+++ b/api/routes/collector.py
@@ -6,6 +6,7 @@
 from collector_manager.enums import CollectorType
 from core.AsyncCore import AsyncCore
 from core.DTOs.CollectorStartInfo import CollectorStartInfo
+from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
 from security_manager.SecurityManager import AccessInfo, get_access_info
 from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO
 from source_collectors.ckan.DTOs import CKANInputDTO
@@ -122,4 +123,18 @@ async def start_muckrock_all_foia_collector(
         collector_type=CollectorType.MUCKROCK_ALL_SEARCH,
         dto=dto,
         user_id=access_info.user_id
+    )
+
+@collector_router.post("/manual")
+async def upload_manual_collector(
+    dto: ManualBatchInputDTO,
+    core: AsyncCore = Depends(get_async_core),
+    access_info: AccessInfo = Depends(get_access_info),
+) -> CollectorStartInfo:
+    """
+    Uploads a manual "collector" with existing data
+    """
+    return await core.upload_manual_batch(
+        dto=dto,
+        user_id=access_info.user_id
     )
\ No newline at end of file
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index 46cd89db..4bc60de7 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -41,6 +41,8 @@
 from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \
     GetURLsResponseInnerInfo
+from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
+from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
 from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
@@ -1719,6 +1721,57 @@ async def add_all_annotations_to_url(
             )
             session.add(agency_suggestion)
 
+    @session_manager
+    async def upload_manual_batch(
+        self,
+        session: AsyncSession,
+        user_id: int,
+        dto: ManualBatchInputDTO
+    ) -> ManualBatchOutputDTO:
+        batch = Batch(
+            strategy=CollectorType.MANUAL.value,
+            status=BatchStatus.READY_TO_LABEL.value,
+            parameters={
+                "name": dto.name
+            },
+            user_id=user_id
+        )
+        session.add(batch)
+        await session.flush()
+
+        batch_id = batch.id
+        url_ids = []
+
+        for entry in dto.entries:
+            url = URL(
+                url=entry.url,
+                name=entry.name,
+                description=entry.description,
+                batch_id=batch_id,
+                collector_metadata=entry.collector_metadata,
+                outcome=URLStatus.PENDING.value,
+                record_type=entry.record_type.value if entry.record_type is not None else None,
+            )
+
+            session.add(url)
+            try:
+                await session.flush()
+            except IntegrityError:
+                await session.rollback()
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail=f"URL already exists: {entry.url}"
+                )
+            await session.flush()
+            optional_metadata = URLOptionalDataSourceMetadata(
+                url_id=url.id,
+                record_formats=entry.record_formats,
+                data_portal_type=entry.data_portal_type,
+                supplying_entity=entry.supplying_entity,
+            )
+            session.add(optional_metadata)
+            url_ids.append(url.id)
+
+        return ManualBatchOutputDTO(batch_id=batch_id, urls=url_ids)
diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py
index 3999dbc9..43ec9628 100644
--- a/collector_db/DatabaseClient.py
+++ b/collector_db/DatabaseClient.py
@@ -1,21 +1,22 @@
-from datetime import datetime, timedelta
 from functools import wraps
 from typing import Optional, List
 
-from sqlalchemy import create_engine, Row
+from sqlalchemy import create_engine
 from sqlalchemy.exc import IntegrityError
-from sqlalchemy.orm import sessionmaker, scoped_session, aliased
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import sessionmaker, scoped_session
 
 from collector_db.ConfigManager import ConfigManager
 from collector_db.DTOs.BatchInfo import BatchInfo
-from collector_db.DTOs.DuplicateInfo import DuplicateInfo, DuplicateInsertInfo
+from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo
 from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo
-from collector_db.DTOs.LogInfo import LogInfo, LogOutputInfo
+from collector_db.DTOs.LogInfo import LogInfo
 from collector_db.DTOs.URLInfo import URLInfo
 from collector_db.DTOs.URLMapping import URLMapping
-from collector_db.helper_functions import get_postgres_connection_string
 from collector_db.models import Base, Batch, URL, Log, Duplicate
 from collector_manager.enums import CollectorType
+from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
+from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO
 from core.EnvVarManager import EnvVarManager
 from core.enums import BatchStatus
diff --git a/collector_manager/enums.py b/collector_manager/enums.py
index 692b97e5..5b89ffe2 100644
--- a/collector_manager/enums.py
+++ b/collector_manager/enums.py
@@ -8,6 +8,7 @@ class CollectorType(Enum):
     MUCKROCK_COUNTY_SEARCH = "muckrock_county_search"
     MUCKROCK_ALL_SEARCH = "muckrock_all_search"
     CKAN = "ckan"
+    MANUAL = "manual"
 
 class URLStatus(Enum):
     PENDING = "pending"
diff --git a/core/AsyncCore.py b/core/AsyncCore.py
index 92f097db..2eba5d04 100644
--- a/core/AsyncCore.py
+++ b/core/AsyncCore.py
@@ -22,6 +22,8 @@
 from core.DTOs.GetTasksResponse import GetTasksResponse
 from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo
+from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
+from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO
 from core.DTOs.MessageResponse import MessageResponse
 from core.TaskManager import TaskManager
 from core.enums import BatchStatus, RecordType
@@ -270,5 +272,13 @@ async def reject_url(
             user_id=access_info.user_id
         )
 
-
+    async def upload_manual_batch(
+        self,
+        dto: ManualBatchInputDTO,
+        user_id: int
+    ) -> ManualBatchOutputDTO:
+        return await self.adb_client.upload_manual_batch(
+            user_id=user_id,
+            dto=dto
+        )
diff --git a/core/DTOs/ManualBatchInputDTO.py b/core/DTOs/ManualBatchInputDTO.py
new file mode 100644
index 00000000..9bb98755
--- /dev/null
+++ b/core/DTOs/ManualBatchInputDTO.py
@@ -0,0 +1,24 @@
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from core.enums import RecordType
+
+
+class ManualBatchInnerInputDTO(BaseModel):
+    url: str
+    name: Optional[str] = None
+    description: Optional[str] = None
+    collector_metadata: Optional[dict] = None
+    record_type: Optional[RecordType] = None
+    record_formats: Optional[list[str]] = None
+    data_portal_type: Optional[str] = None
+    supplying_entity: Optional[str] = None
+
+
+class ManualBatchInputDTO(BaseModel):
+    name: str
+    entries: list[ManualBatchInnerInputDTO] = Field(
+        min_length=1,
+        max_length=1000
+    )
\ No newline at end of file
diff --git a/core/DTOs/ManualBatchOutputDTO.py b/core/DTOs/ManualBatchOutputDTO.py
new file mode 100644
index 00000000..119359a6
--- /dev/null
+++ b/core/DTOs/ManualBatchOutputDTO.py
@@ -0,0 +1,6 @@
+from pydantic import BaseModel
+
+
+class ManualBatchOutputDTO(BaseModel):
+    batch_id: int
+    urls: list[int]
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..161cc214
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[project]
+name="source-collector"
+version="0.1.0"
\ No newline at end of file
diff --git a/tests/test_automated/integration/api/test_manual_batch.py b/tests/test_automated/integration/api/test_manual_batch.py
new file mode 100644
index 00000000..82fb4d91
--- /dev/null
+++ b/tests/test_automated/integration/api/test_manual_batch.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_manual_batch(api_test_helper):
+
+    manual_batch_name = "test_manual_batch"
+
+    # Create 50 entries with just URL
+
+
+    # Create 50 entries with URL and all optional fields
+
+
+    # TODO: Continue later
+
+
+    raise NotImplementedError
\ No newline at end of file
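To make the shape of the new payload concrete, here is a sketch of building a ManualBatchInputDTO in code. Everything is inferred from the DTO definition in the DRAFT patch above; the URLs and field values are illustrative only:

    from core.DTOs.ManualBatchInputDTO import (
        ManualBatchInnerInputDTO,
        ManualBatchInputDTO,
    )

    # Only `url` is required on each entry; all other fields default to None.
    entries = [
        ManualBatchInnerInputDTO(url="https://example.com/records"),
        ManualBatchInnerInputDTO(
            url="https://example.com/annual-report",
            name="Annual report",                       # illustrative values
            description="Yearly summary of activity",
            collector_metadata={"source": "manual upload"},
            record_formats=["PDF"],
            data_portal_type="CKAN",
            supplying_entity="Example County",
        ),
    ]

    # `entries` is constrained to between 1 and 1000 items by the Field() bounds.
    batch = ManualBatchInputDTO(name="example_manual_batch", entries=entries)

    # model_dump(mode="json") yields the JSON-serializable request body.
    payload = batch.model_dump(mode="json")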
From 25ced55b6af67688d1773f916e4c74c9f39db5ae Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Sat, 3 May 2025 10:18:05 -0400
Subject: [PATCH 157/182] feat(app): Add `/collector/manual` endpoint

---
 ..._add_manual_strategy_to_batch_strategy_.py |  54 +++++++
 api/routes/collector.py                       |   3 +-
 collector_db/AsyncDatabaseClient.py           |   6 +-
 collector_db/DatabaseClient.py                |   2 +-
 collector_db/models.py                        |   1 +
 core/AsyncCore.py                             |   4 +-
 ...OutputDTO.py => ManualBatchResponseDTO.py} |   2 +-
 .../api/helpers/RequestValidator.py           |  60 ++++++-
 .../integration/api/test_manual_batch.py      | 148 +++++++++++++++++-
 9 files changed, 267 insertions(+), 13 deletions(-)
 create mode 100644 alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py
 rename core/DTOs/{ManualBatchOutputDTO.py => ManualBatchResponseDTO.py} (63%)

diff --git a/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py b/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py
new file mode 100644
index 00000000..c5af4d33
--- /dev/null
+++ b/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py
@@ -0,0 +1,54 @@
+"""Add manual strategy to Batch strategy enum
+
+Revision ID: 028565b77b9e
+Revises: e285e6e7cf71
+Create Date: 2025-05-03 09:56:51.134406
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from util.alembic_helpers import switch_enum_type
+
+# revision identifiers, used by Alembic.
+revision: str = '028565b77b9e'
+down_revision: Union[str, None] = 'e285e6e7cf71'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    switch_enum_type(
+        table_name="batches",
+        column_name="strategy",
+        enum_name="batch_strategy",
+        new_enum_values=[
+            "example",
+            "ckan",
+            "muckrock_county_search",
+            "auto_googler",
+            "muckrock_all_search",
+            "muckrock_simple_search",
+            "common_crawler",
+            "manual"
+        ],
+    )
+
+
+def downgrade() -> None:
+    switch_enum_type(
+        table_name="batches",
+        column_name="strategy",
+        enum_name="batch_strategy",
+        new_enum_values=[
+            "example",
+            "ckan",
+            "muckrock_county_search",
+            "auto_googler",
+            "muckrock_all_search",
+            "muckrock_simple_search",
+            "common_crawler"
+        ],
+    )
diff --git a/api/routes/collector.py b/api/routes/collector.py
index b7628c4f..16f5a900 100644
--- a/api/routes/collector.py
+++ b/api/routes/collector.py
@@ -7,6 +7,7 @@
 from core.AsyncCore import AsyncCore
 from core.DTOs.CollectorStartInfo import CollectorStartInfo
 from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
+from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
 from security_manager.SecurityManager import AccessInfo, get_access_info
 from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO
 from source_collectors.ckan.DTOs import CKANInputDTO
@@ -130,7 +131,7 @@ async def upload_manual_collector(
     dto: ManualBatchInputDTO,
     core: AsyncCore = Depends(get_async_core),
     access_info: AccessInfo = Depends(get_access_info),
-) -> CollectorStartInfo:
+) -> ManualBatchResponseDTO:
     """
     Uploads a manual "collector" with existing data
     """
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index 4bc60de7..b110c614 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -42,7 +42,7 @@
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \
     GetURLsResponseInnerInfo
 from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
-from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO
+from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
 from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
@@ -1727,7 +1727,7 @@ async def upload_manual_batch(
         session: AsyncSession,
         user_id: int,
         dto: ManualBatchInputDTO
-    ) -> ManualBatchOutputDTO:
+    ) -> ManualBatchResponseDTO:
         batch = Batch(
             strategy=CollectorType.MANUAL.value,
             status=BatchStatus.READY_TO_LABEL.value,
@@ -1773,5 +1773,5 @@ async def upload_manual_batch(
             url_ids.append(url.id)
 
-        return ManualBatchOutputDTO(batch_id=batch_id, urls=url_ids)
+        return ManualBatchResponseDTO(batch_id=batch_id, urls=url_ids)
diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py
index 43ec9628..94320fbc 100644
--- a/collector_db/DatabaseClient.py
+++ b/collector_db/DatabaseClient.py
@@ -16,7 +16,7 @@
 from collector_db.models import Base, Batch, URL, Log, Duplicate
 from collector_manager.enums import CollectorType
 from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
-from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO
+from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
 from core.EnvVarManager import EnvVarManager
 from core.enums import BatchStatus
diff --git a/collector_db/models.py b/collector_db/models.py
index 42b113c6..b5e70cdc 100644
--- a/collector_db/models.py
+++ b/collector_db/models.py
@@ -41,6 +41,7 @@ class Batch(Base):
         'muckrock_all_search',
         'muckrock_simple_search',
         'common_crawler',
+        'manual',
         name='batch_strategy'),
         nullable=False)
     user_id = Column(Integer, nullable=False)
diff --git a/core/AsyncCore.py b/core/AsyncCore.py
index 2eba5d04..59a892ef 100644
--- a/core/AsyncCore.py
+++ b/core/AsyncCore.py
@@ -23,7 +23,7 @@
 from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo
 from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
-from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO
+from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
 from core.DTOs.MessageResponse import MessageResponse
 from core.TaskManager import TaskManager
 from core.enums import BatchStatus, RecordType
@@ -276,7 +276,7 @@ async def upload_manual_batch(
         self,
         dto: ManualBatchInputDTO,
         user_id: int
-    ) -> ManualBatchOutputDTO:
+    ) -> ManualBatchResponseDTO:
         return await self.adb_client.upload_manual_batch(
             user_id=user_id,
             dto=dto
diff --git a/core/DTOs/ManualBatchOutputDTO.py b/core/DTOs/ManualBatchResponseDTO.py
similarity index 63%
rename from core/DTOs/ManualBatchOutputDTO.py
rename to core/DTOs/ManualBatchResponseDTO.py
index 119359a6..f656fda0 100644
--- a/core/DTOs/ManualBatchOutputDTO.py
+++ b/core/DTOs/ManualBatchResponseDTO.py
@@ -1,6 +1,6 @@
 from pydantic import BaseModel
 
 
-class ManualBatchOutputDTO(BaseModel):
+class ManualBatchResponseDTO(BaseModel):
     batch_id: int
     urls: list[int]
\ No newline at end of file
diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py
index 28e4b4a3..07de3c95 100644
--- a/tests/test_automated/integration/api/helpers/RequestValidator.py
+++ b/tests/test_automated/integration/api/helpers/RequestValidator.py
@@ -1,6 +1,7 @@
 from http import HTTPStatus
 from typing import Optional, Annotated
 
+from fastapi import HTTPException
 from pydantic import BaseModel
 from starlette.testclient import TestClient
 
@@ -24,6 +25,8 @@
 from core.DTOs.GetTasksResponse import GetTasksResponse
 from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse
 from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo
+from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
+from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
 from core.DTOs.MessageResponse import MessageResponse
 from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo
 from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo
@@ -32,7 +35,11 @@
 
 
 class ExpectedResponseInfo(BaseModel):
-    status_code: Annotated[HTTPStatus, "The expected status code"] = HTTPStatus.OK
+    status_code: Annotated[
+        HTTPStatus,
+        "The expected status code"
+    ] = HTTPStatus.OK
+    message: Optional[str] = None
 
 
 class RequestValidator:
     """
@@ -64,6 +71,31 @@ def open(
         assert response.status_code == expected_response.status_code, response.text
         return response.json()
 
+    def open_v2(
+        self,
+        method: str,
+        url: str,
+        params: Optional[dict] = None,
+        **kwargs
+    ) -> dict:
+        """
+        Variation on open that raises an exception rather than check the status code
+        """
+        if params:
+            kwargs["params"] = params
+        response = self.client.request(
+            method=method,
+            url=url,
+            headers={"Authorization": "Bearer token"},  # Fake authentication that is overridden during testing
+            **kwargs
+        )
+        if response.status_code != HTTPStatus.OK:
+            raise HTTPException(
+                status_code=response.status_code,
+                detail=response.json()
+            )
+        return response.json()
+
     def get(
         self,
         url: str,
@@ -94,6 +126,20 @@ def post(
             **kwargs
         )
 
+    def post_v2(
+        self,
+        url: str,
+        params: Optional[dict] = None,
+        **kwargs
+    ) -> dict:
+        return self.open_v2(
+            method="POST",
+            url=url,
+            params=params,
+            **kwargs
+        )
+
+
     def put(
         self,
         url: str,
@@ -329,4 +375,14 @@ async def post_all_annotations_and_get_next(
             params=params,
             json=all_annotations_post_info.model_dump(mode='json')
         )
-        return GetNextURLForAllAnnotationResponse(**data)
\ No newline at end of file
+        return GetNextURLForAllAnnotationResponse(**data)
+
+    async def submit_manual_batch(
+        self,
+        dto: ManualBatchInputDTO,
+    ) -> ManualBatchResponseDTO:
+        data = self.post_v2(
+            url="/collector/manual",
+            json=dto.model_dump(mode='json'),
+        )
+        return ManualBatchResponseDTO(**data)
\ No newline at end of file
diff --git a/tests/test_automated/integration/api/test_manual_batch.py b/tests/test_automated/integration/api/test_manual_batch.py
index 82fb4d91..e7c34af1 100644
--- a/tests/test_automated/integration/api/test_manual_batch.py
+++ b/tests/test_automated/integration/api/test_manual_batch.py
@@ -1,18 +1,160 @@
+
 import pytest
+from fastapi import HTTPException
+
+from collector_db.models import Batch, URL, URLOptionalDataSourceMetadata
+from collector_manager.enums import CollectorType
+from core.DTOs.ManualBatchInputDTO import ManualBatchInnerInputDTO, ManualBatchInputDTO
+from core.enums import RecordType
 
 
 @pytest.mark.asyncio
 async def test_manual_batch(api_test_helper):
+    ath = api_test_helper
 
     manual_batch_name = "test_manual_batch"
 
     # Create 50 entries with just URL
+    dtos = []
+    for i in range(50):
+        dto = ManualBatchInnerInputDTO(
+            url=f"https://example.com/{i}",
+        )
+        dtos.append(dto)
 
     # Create 50 entries with URL and all optional fields
+    for i in range(50):
+        dto = ManualBatchInnerInputDTO(
+            url=f"https://example.com/{i+50}",
+            name=manual_batch_name,
+            description=f"Description {i}",
+            collector_metadata={
+                "name": f"Name {i}",
+            },
+            record_type=RecordType.ARREST_RECORDS,
+            record_formats=[f"Record Format {i}"],
+            data_portal_type=f"Data Portal Type {i}",
+            supplying_entity=f"Supplying Entity {i}"
+        )
+        dtos.append(dto)
+
+    input_dto = ManualBatchInputDTO(
+        name=manual_batch_name,
+        entries=dtos
+    )
+
+    # Submit batch successfully
+    response = await ath.request_validator.submit_manual_batch(input_dto)
+
+    # Check 100 URLs in url attribute
+    assert len(response.urls) == 100
+
+    # Get batch from database
+    adb_client = ath.adb_client()
+    batches = await adb_client.get_all(Batch)
+
+    # Confirm only one batch
+    assert len(batches) == 1
+
+    batch: Batch = batches[0]
+    # Assert batch id matches response's batch id
+    assert batch.id == response.batch_id
+    # Assert strategy of manual
+    assert batch.strategy == CollectorType.MANUAL.value
+    # Assert parameters has name value of `test_manual_batch`
+    assert batch.parameters["name"] == manual_batch_name
+    # Assert has expected user id
+    assert batch.user_id == 1
+
+    # Get URLs from database
+    urls: list[URL] = await adb_client.get_all(URL)
+
+    # Confirm 100 URLs
+    assert len(urls) == 100
+
+    def check_attributes(
+        object: URL or URLOptionalDataSourceMetadata,
+        attributes: list[str],
+        attributes_are_none: bool
+    ):
+        for attr in attributes:
+            if attributes_are_none:
+                if getattr(object, attr) is not None:
+                    return False
+            else:
+                if getattr(object, attr) is None:
+                    return False
+        return True
+
+    def check_url(url: URL, url_only: bool):
+        assert url.batch_id == batch.id
+        assert url.url is not None
+        other_attributes = ["name", "description", "collector_metadata", "record_type"]
+        return check_attributes(url, other_attributes, url_only)
+
+
+    # Confirm 50 have only name value
+    count_only_name = 0
+    for url in urls:
+        if check_url(url, True):
+            count_only_name += 1
+    assert count_only_name == 50
 
+    # Confirm 50 have all optional fields
+    count_all = 0
+    for url in urls:
+        if check_url(url, False):
+            count_all += 1
+    assert count_all == 50
+
+    # Get Optional URL Metadata from Database
+    opt_metadata: list[URLOptionalDataSourceMetadata] = await adb_client.get_all(URLOptionalDataSourceMetadata)
+
+    # Confirm 100
+    assert len(opt_metadata) == 100
+
+    def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: bool):
+        assert metadata.url_id is not None
+        other_attributes = ["record_formats", "data_portal_type", "supplying_entity"]
+        return check_attributes(metadata, other_attributes, no_optional)
+
+    # Confirm 50 have nothing but URL id
+    count_only_url_id = 0
+    for metadata in opt_metadata:
+        if check_opt_metadata(metadata, True):
+            count_only_url_id += 1
+    assert count_only_url_id == 50
 
+    # Confirm 50 have all optional fields
+    count_all = 0
+    for metadata in opt_metadata:
+        if check_opt_metadata(metadata, False):
+            count_all += 1
+    assert count_all == 50
 
+    # Insert another batch including good urls and one duplicate
+    more_dtos = []
+    for i in range(49):
+        dto = ManualBatchInnerInputDTO(
+            url=f"https://example.com/{i+100}",
+        )
+        more_dtos.append(dto)
 
-    # TODO: Continue later
+    dto = ManualBatchInnerInputDTO(
+        url=f"https://example.com/1",
+    )
+    more_dtos.append(dto)
 
+    duplicate_input_dto = ManualBatchInputDTO(
+        name=manual_batch_name,
+        entries=more_dtos
+    )
 
-    raise NotImplementedError
\ No newline at end of file
+    # Submit batch
+    try:
+        response = await ath.request_validator.submit_manual_batch(duplicate_input_dto)
+    except HTTPException as e:
+        # Confirm got a BAD REQUEST error identifying the correct duplicate URL
+        assert e.status_code == 400
+        assert e.detail == {
+            "detail": 'URL already exists: https://example.com/1'
+        }
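With the endpoint now returning ManualBatchResponseDTO, a client call over HTTP might look like the following. This is a hedged sketch, not project tooling: the base URL and bearer token are placeholders, and the payload mirrors the DTO fields shown in the patches above:

    import requests

    BASE_URL = "http://localhost:80"   # placeholder; depends on deployment
    TOKEN = "..."                      # placeholder bearer token

    payload = {
        "name": "example_manual_batch",
        "entries": [
            {"url": "https://example.com/records"},
            {
                "url": "https://example.com/annual-report",
                "name": "Annual report",
                "description": "Yearly summary of activity",
            },
        ],
    }

    response = requests.post(
        f"{BASE_URL}/collector/manual",
        json=payload,
        headers={"Authorization": f"Bearer {TOKEN}"},
    )
    response.raise_for_status()

    body = response.json()
    # Mirrors ManualBatchResponseDTO: the new batch id plus created URL ids.
    print(body["batch_id"], body["urls"])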
From addc5f57df3bdd6f361654bfc60d1e8c98e17914 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Sat, 3 May 2025 10:28:52 -0400
Subject: [PATCH 158/182] feat(app): Add `/collector/manual` endpoint

---
 collector_db/AsyncDatabaseClient.py           | 23 ++++++++-------
 core/DTOs/ManualBatchResponseDTO.py           |  3 +-
 .../integration/api/test_manual_batch.py      | 28 ++++++++++---------
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index b110c614..d6d949ea 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -1741,6 +1741,7 @@ async def upload_manual_batch(
 
         batch_id = batch.id
         url_ids = []
+        duplicate_urls = []
 
         for entry in dto.entries:
             url = URL(
@@ -1753,15 +1754,13 @@ async def upload_manual_batch(
                 record_type=entry.record_type.value if entry.record_type is not None else None,
             )
 
-            session.add(url)
-            try:
-                await session.flush()
-            except IntegrityError:
-                await session.rollback()
-                raise HTTPException(
-                    status_code=status.HTTP_400_BAD_REQUEST,
-                    detail=f"URL already exists: {entry.url}"
-                )
+            async with session.begin_nested():
+                try:
+                    session.add(url)
+                    await session.flush()
+                except IntegrityError:
+                    duplicate_urls.append(entry.url)
+                    continue
             await session.flush()
             optional_metadata = URLOptionalDataSourceMetadata(
                 url_id=url.id,
@@ -1773,5 +1772,9 @@ async def upload_manual_batch(
             url_ids.append(url.id)
 
-        return ManualBatchResponseDTO(batch_id=batch_id, urls=url_ids)
+        return ManualBatchResponseDTO(
+            batch_id=batch_id,
+            urls=url_ids,
+            duplicate_urls=duplicate_urls
+        )
diff --git a/core/DTOs/ManualBatchResponseDTO.py b/core/DTOs/ManualBatchResponseDTO.py
index f656fda0..b572fbb2 100644
--- a/core/DTOs/ManualBatchResponseDTO.py
+++ b/core/DTOs/ManualBatchResponseDTO.py
@@ -3,4 +3,5 @@
 class ManualBatchResponseDTO(BaseModel):
     batch_id: int
-    urls: list[int]
\ No newline at end of file
+    urls: list[int]
+    duplicate_urls: list[str]
\ No newline at end of file
diff --git a/tests/test_automated/integration/api/test_manual_batch.py b/tests/test_automated/integration/api/test_manual_batch.py
index e7c34af1..e9a101eb 100644
--- a/tests/test_automated/integration/api/test_manual_batch.py
+++ b/tests/test_automated/integration/api/test_manual_batch.py
@@ -1,6 +1,5 @@
 
 import pytest
-from fastapi import HTTPException
 
 from collector_db.models import Batch, URL, URLOptionalDataSourceMetadata
 from collector_manager.enums import CollectorType
@@ -139,10 +138,12 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo
         )
         more_dtos.append(dto)
 
-    dto = ManualBatchInnerInputDTO(
-        url=f"https://example.com/1",
-    )
-    more_dtos.append(dto)
+    for i in range(2):
+        dto = ManualBatchInnerInputDTO(
+            url=f"https://example.com/{i+1}",
+        )
+        more_dtos.append(dto)
+
 
     duplicate_input_dto = ManualBatchInputDTO(
         name=manual_batch_name,
@@ -150,11 +151,12 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo
     )
 
     # Submit batch
-    try:
-        response = await ath.request_validator.submit_manual_batch(duplicate_input_dto)
-    except HTTPException as e:
-        # Confirm got a BAD REQUEST error identifying the correct duplicate URL
-        assert e.status_code == 400
-        assert e.detail == {
-            "detail": 'URL already exists: https://example.com/1'
-        }
+    response = await ath.request_validator.submit_manual_batch(duplicate_input_dto)
+    # Check duplicate URLs
+    assert len(response.duplicate_urls) == 2
+    assert response.duplicate_urls == ['https://example.com/1', 'https://example.com/2']
+    assert len(response.urls) == 49
+
+    # Check 149 URLs in database
+    urls: list[URL] = await adb_client.get_all(URL)
+    assert len(urls) == 149
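The switch to session.begin_nested() above runs each insert under a SAVEPOINT, so one duplicate URL no longer aborts the whole batch. A stripped-down sketch of the pattern with SQLAlchemy's AsyncSession; the model and function names here are illustrative, not project code:

    from sqlalchemy.exc import IntegrityError
    from sqlalchemy.ext.asyncio import AsyncSession


    async def add_rows_skipping_duplicates(session: AsyncSession, rows: list) -> list:
        # Each insert gets its own SAVEPOINT via begin_nested(); a
        # unique-constraint violation rolls back only that savepoint,
        # so earlier inserts in the same outer transaction survive.
        skipped = []
        for row in rows:
            try:
                async with session.begin_nested():
                    session.add(row)
                    await session.flush()
            except IntegrityError:
                skipped.append(row)
        return skipped

Catching the IntegrityError outside the async with block lets the context manager roll the savepoint back cleanly before the loop moves on to the next row.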
a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -563,7 +563,7 @@ async def add_to_root_url_cache(self, session: AsyncSession, url: str, page_titl async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetURLsResponseInfo: statement = select(URL).options( selectinload(URL.error_info) - ) + ).order_by(URL.id) if errors: # Only return URLs with errors statement = statement.where( From 02f7b3a43a2db4f76ee620c08fc92d2c30ce5cc6 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 3 May 2025 21:23:30 -0400 Subject: [PATCH 160/182] Comment out URL relevance Huggingface Task Operator call --- core/TaskManager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/TaskManager.py b/core/TaskManager.py index e72724fc..7bba4c67 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -99,7 +99,7 @@ async def get_url_miscellaneous_metadata_task_operator(self): async def get_task_operators(self) -> list[TaskOperatorBase]: return [ await self.get_url_html_task_operator(), - await self.get_url_relevance_huggingface_task_operator(), + # await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), await self.get_agency_identification_task_operator(), await self.get_url_miscellaneous_metadata_task_operator(), From 18be3c9506bb4b64971ebed2536214ce8bd19f9d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 3 May 2025 21:43:42 -0400 Subject: [PATCH 161/182] Comment out URL relevance Huggingface Task Operator call --- core/TaskManager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/TaskManager.py b/core/TaskManager.py index 7bba4c67..4761a62b 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -101,7 +101,7 @@ async def get_task_operators(self) -> list[TaskOperatorBase]: await self.get_url_html_task_operator(), # await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), - await self.get_agency_identification_task_operator(), + # await self.get_agency_identification_task_operator(), await self.get_url_miscellaneous_metadata_task_operator(), await self.get_submit_approved_url_task_operator() ] From 1fcc9493bb58185932abd8fbefc03829b0908ad9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 4 May 2025 09:08:35 -0400 Subject: [PATCH 162/182] feat(app): Add `/search/url` endpoint --- api/main.py | 4 +++- api/routes/search.py | 20 ++++++++++++++++ collector_db/AsyncDatabaseClient.py | 16 +++++++++++++ core/AsyncCore.py | 3 +++ core/DTOs/SearchURLResponse.py | 8 +++++++ .../api/helpers/RequestValidator.py | 10 +++++++- .../integration/api/test_search.py | 23 +++++++++++++++++++ 7 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 api/routes/search.py create mode 100644 core/DTOs/SearchURLResponse.py create mode 100644 tests/test_automated/integration/api/test_search.py diff --git a/api/main.py b/api/main.py index ae74c914..1b80716e 100644 --- a/api/main.py +++ b/api/main.py @@ -11,6 +11,7 @@ from api.routes.collector import collector_router from api.routes.review import review_router from api.routes.root import root_router +from api.routes.search import search_router from api.routes.task import task_router from api.routes.url import url_router from collector_db.AsyncDatabaseClient import AsyncDatabaseClient @@ -128,7 +129,8 @@ async def redirect_docs(): annotate_router, url_router, task_router, - review_router + review_router, + search_router ] for router in routers: diff --git a/api/routes/search.py 
b/api/routes/search.py new file mode 100644 index 00000000..4513bb2f --- /dev/null +++ b/api/routes/search.py @@ -0,0 +1,20 @@ +from fastapi import APIRouter, Query, Depends + +from api.dependencies import get_async_core +from core.AsyncCore import AsyncCore +from core.DTOs.SearchURLResponse import SearchURLResponse +from security_manager.SecurityManager import get_access_info, AccessInfo + +search_router = APIRouter(prefix="/search", tags=["search"]) + + +@search_router.get("/url") +async def search_url( + url: str = Query(description="The URL to search for"), + access_info: AccessInfo = Depends(get_access_info), + async_core: AsyncCore = Depends(get_async_core), +) -> SearchURLResponse: + """ + Search for a URL in the database + """ + return await async_core.search_for_url(url) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 52ab2c9c..85d74146 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -43,6 +43,7 @@ GetURLsResponseInnerInfo from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO +from core.DTOs.SearchURLResponse import SearchURLResponse from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo @@ -1778,3 +1779,18 @@ async def upload_manual_batch( duplicate_urls=duplicate_urls ) + @session_manager + async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: + query = select(URL).where(URL.url == url) + raw_results = await session.execute(query) + url = raw_results.scalars().one_or_none() + if url is None: + return SearchURLResponse( + found=False, + url_id=None + ) + return SearchURLResponse( + found=True, + url_id=url.id + ) + diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 59a892ef..f1d69fb2 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -25,6 +25,7 @@ from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO from core.DTOs.MessageResponse import MessageResponse +from core.DTOs.SearchURLResponse import SearchURLResponse from core.TaskManager import TaskManager from core.enums import BatchStatus, RecordType @@ -282,3 +283,5 @@ async def upload_manual_batch( dto=dto ) + async def search_for_url(self, url: str) -> SearchURLResponse: + return await self.adb_client.search_for_url(url) diff --git a/core/DTOs/SearchURLResponse.py b/core/DTOs/SearchURLResponse.py new file mode 100644 index 00000000..1a46c0be --- /dev/null +++ b/core/DTOs/SearchURLResponse.py @@ -0,0 +1,8 @@ +from typing import Optional + +from pydantic import BaseModel + + +class SearchURLResponse(BaseModel): + found: bool + url_id: Optional[int] = None \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 07de3c95..c2d246f5 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -30,6 +30,7 @@ from core.DTOs.MessageResponse import MessageResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo 
+from core.DTOs.SearchURLResponse import SearchURLResponse from core.enums import BatchStatus from util.helper_functions import update_if_not_none @@ -385,4 +386,11 @@ async def submit_manual_batch( url="/collector/manual", json=dto.model_dump(mode='json'), ) - return ManualBatchResponseDTO(**data) \ No newline at end of file + return ManualBatchResponseDTO(**data) + + async def search_url(self, url: str) -> SearchURLResponse: + data = self.get( + url=f"/search/url", + params={"url": url} + ) + return SearchURLResponse(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_search.py b/tests/test_automated/integration/api/test_search.py new file mode 100644 index 00000000..917690fc --- /dev/null +++ b/tests/test_automated/integration/api/test_search.py @@ -0,0 +1,23 @@ +import pytest + +from core.DTOs.SearchURLResponse import SearchURLResponse + + +@pytest.mark.asyncio +async def test_search_url(api_test_helper): + ath = api_test_helper + + # Create a batch with 1 URL + creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) + + # Search for that URL and locate it + response: SearchURLResponse = await ath.request_validator.search_url(url=creation_info.urls[0]) + + assert response.found + assert response.url_id == creation_info.url_ids[0] + + # Search for a non-existent URL + response: SearchURLResponse = await ath.request_validator.search_url(url="http://doesnotexist.com") + + assert not response.found + assert response.url_id is None \ No newline at end of file From e090cadb617c1d393d2019d648857e6313ae82bd Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 4 May 2025 10:56:02 -0400 Subject: [PATCH 163/182] feat(app): Add special error message for annotation user conflict --- core/AsyncCore.py | 36 +++++++---- core/classes/ErrorManager.py | 44 +++++++++++++ core/enums.py | 5 ++ core/helpers.py | 4 ++ .../api/helpers/RequestValidator.py | 4 +- .../integration/api/test_annotate.py | 62 +++++++++++++++++++ 6 files changed, 142 insertions(+), 13 deletions(-) create mode 100644 core/classes/ErrorManager.py diff --git a/core/AsyncCore.py b/core/AsyncCore.py index f1d69fb2..46ccca0d 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -1,6 +1,7 @@ from typing import Optional from pydantic import BaseModel +from sqlalchemy.exc import IntegrityError from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.BatchInfo import BatchInfo @@ -27,7 +28,8 @@ from core.DTOs.MessageResponse import MessageResponse from core.DTOs.SearchURLResponse import SearchURLResponse from core.TaskManager import TaskManager -from core.enums import BatchStatus, RecordType +from core.classes.ErrorManager import ErrorManager +from core.enums import BatchStatus, RecordType, AnnotationType from security_manager.SecurityManager import AccessInfo @@ -149,11 +151,17 @@ async def submit_url_relevance_annotation( url_id: int, relevant: bool ): - return await self.adb_client.add_user_relevant_suggestion( - user_id=user_id, - url_id=url_id, - relevant=relevant - ) + try: + return await self.adb_client.add_user_relevant_suggestion( + user_id=user_id, + url_id=url_id, + relevant=relevant + ) + except IntegrityError as e: + return await ErrorManager.raise_annotation_exists_error( + annotation_type=AnnotationType.RELEVANCE, + url_id=url_id + ) async def get_next_url_for_relevance_annotation( self, @@ -187,11 +195,17 @@ async def submit_url_record_type_annotation( url_id: int, record_type: RecordType, ): - await 
self.adb_client.add_user_record_type_suggestion( - user_id=user_id, - url_id=url_id, - record_type=record_type - ) + try: + return await self.adb_client.add_user_record_type_suggestion( + user_id=user_id, + url_id=url_id, + record_type=record_type + ) + except IntegrityError as e: + return await ErrorManager.raise_annotation_exists_error( + annotation_type=AnnotationType.RECORD_TYPE, + url_id=url_id + ) async def get_next_url_agency_for_annotation( diff --git a/core/classes/ErrorManager.py b/core/classes/ErrorManager.py new file mode 100644 index 00000000..ba763054 --- /dev/null +++ b/core/classes/ErrorManager.py @@ -0,0 +1,44 @@ +from enum import Enum +from http import HTTPStatus + +from fastapi import HTTPException +from pydantic import BaseModel + +from core.enums import AnnotationType + + +class ErrorTypes(Enum): + ANNOTATION_EXISTS = "ANNOTATION_EXISTS" + +class ErrorFormat(BaseModel): + code: ErrorTypes + message: str + + +class ErrorManager: + + @staticmethod + async def raise_error( + error_type: ErrorTypes, + message: str, + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST + ): + raise HTTPException( + status_code=status_code, + detail=ErrorFormat( + code=error_type, + message=message + ).model_dump(mode='json') + ) + + @staticmethod + async def raise_annotation_exists_error( + annotation_type: AnnotationType, + url_id: int + ): + await ErrorManager.raise_error( + error_type=ErrorTypes.ANNOTATION_EXISTS, + message=f"Annotation of type {annotation_type.value} already exists" + f" for url {url_id}", + status_code=HTTPStatus.CONFLICT + ) diff --git a/core/enums.py b/core/enums.py index 173c66e9..019572b8 100644 --- a/core/enums.py +++ b/core/enums.py @@ -1,5 +1,10 @@ from enum import Enum +class AnnotationType(Enum): + RELEVANCE = "RELEVANCE" + RECORD_TYPE = "RECORD_TYPE" + AGENCY = "AGENCY" + class BatchStatus(Enum): READY_TO_LABEL = "ready to label" diff --git a/core/helpers.py b/core/helpers.py index bac603bd..1fc51cde 100644 --- a/core/helpers.py +++ b/core/helpers.py @@ -1,3 +1,7 @@ +from http import HTTPStatus + +from fastapi import HTTPException + from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.enums import SuggestionType from core.exceptions import MatchAgencyError diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index c2d246f5..91d27729 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -246,7 +246,7 @@ def post_record_type_annotation_and_get_next( url_id: int, record_type_annotation_post_info: RecordTypeAnnotationPostInfo ) -> GetNextRecordTypeAnnotationResponseOuterInfo: - data = self.post( + data = self.post_v2( url=f"/annotate/record-type/{url_id}", json=record_type_annotation_post_info.model_dump(mode='json') ) @@ -257,7 +257,7 @@ def post_relevance_annotation_and_get_next( url_id: int, relevance_annotation_post_info: RelevanceAnnotationPostInfo ) -> GetNextRelevanceAnnotationResponseOuterInfo: - data = self.post( + data = self.post_v2( url=f"/annotate/relevance/{url_id}", json=relevance_annotation_post_info.model_dump(mode='json') ) diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index a03540a1..03088cd7 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -1,6 +1,7 @@ from http import 
HTTPStatus import pytest +from fastapi import HTTPException from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.DTOs.URLMapping import URLMapping @@ -11,6 +12,7 @@ from core.DTOs.GetNextURLForAgencyAnnotationResponse import URLAgencyAnnotationPostInfo from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo +from core.classes.ErrorManager import ErrorTypes from core.enums import RecordType, SuggestionType from core.exceptions import FailedValidationException from tests.helpers.complex_test_data_functions import AnnotateAgencySetupInfo, setup_for_annotate_agency, \ @@ -130,6 +132,36 @@ async def test_annotate_relevancy(api_test_helper): assert results[0].relevant is True +@pytest.mark.asyncio +async def test_annotate_relevancy_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_relevant_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + relevant=True + ) + + # Annotate with different user (default is 1) and expect a conflict error; + # pytest.raises fails the test if no exception is raised at all + with pytest.raises(HTTPException) as exc_info: + await ath.request_validator.post_relevance_annotation_and_get_next( + url_id=creation_info.url_ids[0], + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + is_relevant=False + ) + ) + e = exc_info.value + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" + + @pytest.mark.asyncio async def test_annotate_relevancy_no_html(api_test_helper): ath = api_test_helper @@ -250,6 +282,36 @@ async def test_annotate_record_type(api_test_helper): if result.url_id == inner_info_1.url_info.url_id: assert result.record_type == RecordType.BOOKING_REPORTS.value +@pytest.mark.asyncio +async def test_annotate_record_type_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_record_type_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Annotate with different user (default is 1) and expect a conflict error; + # pytest.raises fails the test if no exception is raised at all + with pytest.raises(HTTPException) as exc_info: + await ath.request_validator.post_record_type_annotation_and_get_next( + url_id=creation_info.url_ids[0], + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + e = exc_info.value + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" + + @pytest.mark.asyncio async def test_annotate_record_type_no_html_info(api_test_helper): ath = api_test_helper From 154895e2044791de8ae85ff43ad81b3d5deb7dfa Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 6 May 2025 09:20:50 -0400 Subject: [PATCH 164/182] DRAFT --- ...5e16e0738f_create_backlogsnapshot_table.py | 31 ++ ...e17c04_create_url_annotation_flags_view.py | 47 +++ api/main.py | 4 +- api/routes/metrics.py | 63 ++++ collector_db/AsyncDatabaseClient.py | 313 +++++++++++++++++- collector_db/StatementComposer.py | 10 +- 
collector_db/models.py | 9 +- core/AsyncCore.py | 24 ++ core/DTOs/GetMetricsBacklogResponse.py | 10 + .../GetMetricsBatchesAggregatedResponseDTO.py | 22 ++ .../GetMetricsBatchesBreakdownResponseDTO.py | 19 ++ .../GetMetricsURLsAggregatedResponseDTO.py | 14 + ...tMetricsURLsBreakdownPendingResponseDTO.py | 12 + ...etricsURLsBreakdownSubmittedResponseDTO.py | 10 + core/ScheduledTaskManager.py | 8 + 15 files changed, 591 insertions(+), 5 deletions(-) create mode 100644 alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py create mode 100644 alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py create mode 100644 api/routes/metrics.py create mode 100644 core/DTOs/GetMetricsBacklogResponse.py create mode 100644 core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py create mode 100644 core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py create mode 100644 core/DTOs/GetMetricsURLsAggregatedResponseDTO.py create mode 100644 core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py create mode 100644 core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py diff --git a/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py b/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py new file mode 100644 index 00000000..d6b118fb --- /dev/null +++ b/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py @@ -0,0 +1,31 @@ +"""Create BacklogSnapshot Table + +Revision ID: e55e16e0738f +Revises: 028565b77b9e +Create Date: 2025-05-06 08:16:29.385305 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e55e16e0738f' +down_revision: Union[str, None] = '028565b77b9e' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'backlog_snapshot', + sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('count_pending_total', sa.Integer(), nullable=False), + sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False), + ) + + +def downgrade() -> None: + op.drop_table('backlog_snapshot') diff --git a/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py b/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py new file mode 100644 index 00000000..f0250c06 --- /dev/null +++ b/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py @@ -0,0 +1,47 @@ +"""Create URL Annotation Flags View + +Revision ID: f25852e17c04 +Revises: e55e16e0738f +Create Date: 2025-05-06 09:19:54.000410 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'f25852e17c04' +down_revision: Union[str, None] = 'e55e16e0738f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + CREATE VIEW url_annotation_flags AS + SELECT + u.id, + u.outcome, + CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed + FROM urls u + LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id + LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id + LEFT JOIN public.automated_url_agency_suggestions auas ON u.id = auas.url_id + LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id; + + """) + + +def downgrade() -> None: + op.execute("DROP VIEW url_annotation_flags;") diff --git a/api/main.py b/api/main.py index 1b80716e..94b52cd2 100644 --- a/api/main.py +++ b/api/main.py @@ -9,6 +9,7 @@ from api.routes.annotate import annotate_router from api.routes.batch import batch_router from api.routes.collector import collector_router +from api.routes.metrics import metrics_router from api.routes.review import review_router from api.routes.root import root_router from api.routes.search import search_router from api.routes.task import task_router from api.routes.url import url_router from collector_db.AsyncDatabaseClient import AsyncDatabaseClient @@ -130,7 +131,8 @@ async def redirect_docs(): url_router, task_router, review_router, - search_router + search_router, + metrics_router ] for router in routers: diff --git a/api/routes/metrics.py b/api/routes/metrics.py new file mode 100644 index 00000000..ab548437 --- /dev/null +++ b/api/routes/metrics.py @@ -0,0 +1,63 @@ +from fastapi import APIRouter, Depends +from fastapi.params import Query + +from api.dependencies import get_async_core +from core.AsyncCore import AsyncCore +from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO +from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO +from core.DTOs.GetMetricsBatchesBreakdownResponseDTO import GetMetricsBatchesBreakdownResponseDTO +from core.DTOs.GetMetricsURLsAggregatedResponseDTO import GetMetricsURLsAggregatedResponseDTO +from core.DTOs.GetMetricsURLsBreakdownPendingResponseDTO import GetMetricsURLsBreakdownPendingResponseDTO +from core.DTOs.GetMetricsURLsBreakdownSubmittedResponseDTO import GetMetricsURLsBreakdownSubmittedResponseDTO +from security_manager.SecurityManager import get_access_info, AccessInfo + +metrics_router = APIRouter( + prefix="/metrics", + tags=["Metrics"], +) + + +@metrics_router.get("/batches/aggregated") +async def get_batches_aggregated_metrics( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetMetricsBatchesAggregatedResponseDTO: + return await core.get_batches_aggregated_metrics() + +@metrics_router.get("/batches/breakdown") +async def get_batches_breakdown_metrics( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), + page: int = Query( description="The page 
number", + default=1 + ) +) -> GetMetricsBatchesBreakdownResponseDTO: + return await core.get_batches_breakdown_metrics(page=page) + +@metrics_router.get("/urls/aggregate") +async def get_urls_aggregated_metrics( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetMetricsURLsAggregatedResponseDTO: + return await core.get_urls_aggregated_metrics() + +@metrics_router.get("/urls/breakdown/submitted") +async def get_urls_breakdown_submitted_metrics( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetMetricsURLsBreakdownSubmittedResponseDTO: + return await core.get_urls_breakdown_submitted_metrics() + +@metrics_router.get("/urls/breakdown/pending") +async def get_urls_breakdown_pending_metrics( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetMetricsURLsBreakdownPendingResponseDTO: + return await core.get_urls_breakdown_pending_metrics() + +@metrics_router.get("/backlog") +async def get_backlog_metrics( + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetMetricsBacklogResponseDTO: + return await core.get_backlog_metrics() \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 85d74146..cf3cb47f 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1794,3 
+1803,303 @@ async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResp url_id=url.id ) + @session_manager + async def get_batches_aggregated_metrics(self, session: AsyncSession) -> GetMetricsBatchesAggregatedResponseDTO: + sc = StatementComposer + + # First, get all batches broken down by collector type and status + def batch_column(status: BatchStatus, label): + return sc.count_distinct( + case( + (Batch.status == status.value, + Batch.id) + ), + label=label + ) + + batch_count_subquery = select( + batch_column(BatchStatus.READY_TO_LABEL, label="done_count"), + batch_column(BatchStatus.ERROR, label="error_count"), + Batch.strategy, + ).group_by(Batch.strategy).subquery("batch_count") + + def url_column(status: URLStatus, label): + return sc.count_distinct( + case( + (URL.outcome == status.value, + URL.id) + ), + label=label + ) + + # Next, count urls + url_count_subquery = select( + Batch.strategy, + url_column(URLStatus.PENDING, label="pending_count"), + url_column(URLStatus.ERROR, label="error_count"), + url_column(URLStatus.SUBMITTED, label="submitted_count"), + url_column(URLStatus.REJECTED, label="rejected_count"), + ).select_from(URL).join( + Batch, Batch.id == URL.batch_id + ).group_by( + Batch.strategy + ).subquery("url_count") + + # Combine; relabel the two error counts so neither shadows the other on the row + query = select( + batch_count_subquery.c.strategy, + batch_count_subquery.c.done_count, + batch_count_subquery.c.error_count.label("batch_error_count"), + url_count_subquery.c.pending_count, + url_count_subquery.c.error_count.label("url_error_count"), + url_count_subquery.c.submitted_count, + url_count_subquery.c.rejected_count, + ).join( + url_count_subquery, + batch_count_subquery.c.strategy == url_count_subquery.c.strategy + ) + raw_results = await session.execute(query) + results = raw_results.all() + d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} + for result in results: + d[CollectorType(result.strategy)] = GetMetricsBatchesAggregatedInnerResponseDTO( + count_successful_batches=result.done_count, + count_failed_batches=result.batch_error_count, + count_urls=result.pending_count + result.submitted_count + result.rejected_count + result.url_error_count, + count_urls_pending=result.pending_count, + count_urls_submitted=result.submitted_count, + count_urls_rejected=result.rejected_count, + count_urls_errors=result.url_error_count + ) + + total_batch_query = await session.execute( + select( + sc.count_distinct(Batch.id, label="count") + ) + ) + total_batch_count = total_batch_query.scalars().one_or_none() + if total_batch_count is None: + total_batch_count = 0 + + return GetMetricsBatchesAggregatedResponseDTO( + total_batches=total_batch_count, + by_strategy=d + ) + + @session_manager + async def get_batches_breakdown_metrics( + self, + session: AsyncSession, + page: int + ) -> GetMetricsBatchesBreakdownResponseDTO: + sc = StatementComposer + + # subquery() so the combined select below can reference .c columns + main_query = select( + Batch.strategy, + Batch.id, + Batch.date_generated.label("created_at"), + ).subquery("batch_info") + + def url_column(status: URLStatus, label): + return sc.count_distinct( + case( + (URL.outcome == status.value, + URL.id) + ), + label=label + ) + + count_query = select( + URL.batch_id, + sc.count_distinct(URL.id, label="count_total"), + url_column(URLStatus.PENDING, label="count_pending"), + url_column(URLStatus.SUBMITTED, label="count_submitted"), + url_column(URLStatus.REJECTED, label="count_rejected"), + url_column(URLStatus.ERROR, label="count_error"), + ).group_by( + URL.batch_id + ).subquery("url_count") + + query = (select( + main_query.c.strategy, + main_query.c.id, + 
main_query.c.created_at, + count_query.c.count_total, + count_query.c.count_pending, + count_query.c.count_submitted, + count_query.c.count_rejected, + count_query.c.count_error, + ).join( + count_query, + main_query.c.id == count_query.c.batch_id + ).offset( + (page - 1) * 100 + ).order_by( + main_query.c.created_at.asc() + )) + + raw_results = await session.execute(query) + results = raw_results.all() + batches: list[GetMetricsBatchesBreakdownInnerResponseDTO] = [] + for result in results: + dto = GetMetricsBatchesBreakdownInnerResponseDTO( + batch_id=str(result.id), + strategy=CollectorType(result.strategy), + created_at=result.created_at, + count_url_total=result.count_total, + count_url_pending=result.count_pending, + count_url_submitted=result.count_submitted, + count_url_rejected=result.count_rejected, + count_url_error=result.count_error, + ) + batches.append(dto) + return GetMetricsBatchesBreakdownResponseDTO( + batches=batches, + ) + + @session_manager + async def get_urls_breakdown_submitted_metrics( + self, + session: AsyncSession + ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: + pass + + @session_manager + async def get_urls_aggregated_metrics( + self, + session: AsyncSession + ) -> GetMetricsURLsAggregatedResponseDTO: + sc = StatementComposer + + oldest_pending_url_query = select( + URL.id, + URL.created_at + ).where( + URL.outcome == URLStatus.PENDING.value + ).order_by( + URL.created_at.asc() + ).limit(1) + + # .one_or_none() keeps the full (id, created_at) row; .scalars() would reduce it to just the id + raw_oldest_pending = await session.execute(oldest_pending_url_query) + oldest_pending_url = raw_oldest_pending.one_or_none() + if oldest_pending_url is None: + oldest_pending_url_id = None + oldest_pending_created_at = None + else: + oldest_pending_url_id = oldest_pending_url.id + oldest_pending_created_at = oldest_pending_url.created_at + + def case_column(status: URLStatus, label): + return sc.count_distinct( + case( + (URL.outcome == status.value, + URL.id) + ), + label=label + ) + + # Label "count_total" rather than "count", which Row already exposes as the tuple method + count_query = select( + sc.count_distinct(URL.id, label="count_total"), + case_column(URLStatus.PENDING, label="count_pending"), + case_column(URLStatus.SUBMITTED, label="count_submitted"), + case_column(URLStatus.VALIDATED, label="count_validated"), + case_column(URLStatus.REJECTED, label="count_rejected"), + case_column(URLStatus.ERROR, label="count_error"), + ) + raw_results = await session.execute(count_query) + results = raw_results.all() + + return GetMetricsURLsAggregatedResponseDTO( + count_urls_total=results[0].count_total, + count_urls_pending=results[0].count_pending, + count_urls_submitted=results[0].count_submitted, + count_urls_validated=results[0].count_validated, + count_urls_rejected=results[0].count_rejected, + count_urls_errors=results[0].count_error, + oldest_pending_url_id=oldest_pending_url_id, + oldest_pending_url_created_at=oldest_pending_created_at, + ) + + @session_manager + async def get_urls_breakdown_pending_metrics( + self, + session: AsyncSession + ) -> GetMetricsURLsBreakdownPendingResponseDTO: + # I will need to get ways to identify the status of each url + + # Probably would benefit from a view of some sort + + + + pass + + @session_manager + async def get_backlog_metrics( + self, + session: AsyncSession + ) -> GetMetricsBacklogResponseDTO: + # 1. 
Create a subquery that assigns row_number() partitioned by week + weekly_snapshots_subq = ( + select( + BacklogSnapshot.id, + BacklogSnapshot.created_at, + BacklogSnapshot.count_pending_total, + func.date_trunc('week', BacklogSnapshot.created_at).label("week_start"), + func.row_number() + .over( + partition_by=func.date_trunc('week', BacklogSnapshot.created_at), + order_by=BacklogSnapshot.created_at.desc() + ) + .label("row_number") + ) + .subquery() + ) + + # 2. Filter for the top (most recent) row in each week + stmt = ( + select( + weekly_snapshots_subq.c.week_start, + weekly_snapshots_subq.c.created_at, + weekly_snapshots_subq.c.count_pending_total + ) + .where(weekly_snapshots_subq.c.row_number == 1) + .order_by(weekly_snapshots_subq.c.week_start) + ) + + raw_result = await session.execute(stmt) + results = raw_result.all() + final_results = [] + for result in results: + final_results.append( + GetMetricsBacklogResponseInnerDTO( + week_of=result.week_start, + count_pending_total=result.count_pending_total, + ) + ) + + return GetMetricsBacklogResponseDTO(entries=final_results) + + + @session_manager + async def populate_backlog_snapshot( + self, + session: AsyncSession + ): + sc = StatementComposer + # Get count of pending URLs as a scalar subquery + count_subquery = select( + sc.count_distinct(URL.id, label="count") + ).where( + URL.outcome == URLStatus.PENDING.value + ).scalar_subquery() + + # insert count into snapshot + await session.execute( + insert(BacklogSnapshot).values( + count_pending_total=count_subquery + ) + ) + diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index ca66f6ba..42c77ef7 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -114,4 +114,12 @@ def user_suggestion_not_exists( ) ) - return subquery \ No newline at end of file + return subquery + + @staticmethod + def count_distinct(field, label): + return func.count(func.distinct(field)).label(label) + + @staticmethod + def sum_distinct(field, label): + return func.sum(func.distinct(field)).label(label) \ No newline at end of file diff --git a/collector_db/models.py b/collector_db/models.py index b5e70cdc..d3c9b916 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -452,4 +452,11 @@ class UserRecordTypeSuggestion(Base): # Relationships - url = relationship("URL", back_populates="user_record_type_suggestion") \ No newline at end of file + url = relationship("URL", back_populates="user_record_type_suggestion") + +class BacklogSnapshot(Base): + __tablename__ = "backlog_snapshot" + + id = Column(Integer, primary_key=True, autoincrement=True) + count_pending_total = Column(Integer, nullable=False) + created_at = get_created_at_column() diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 46ccca0d..e7b7f534 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -15,6 +15,12 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse +from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO +from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO +from core.DTOs.GetMetricsBatchesBreakdownResponseDTO import GetMetricsBatchesBreakdownResponseDTO +from core.DTOs.GetMetricsURLsAggregatedResponseDTO import GetMetricsURLsAggregatedResponseDTO +from core.DTOs.GetMetricsURLsBreakdownPendingResponseDTO import GetMetricsURLsBreakdownPendingResponseDTO +from
core.DTOs.GetMetricsURLsBreakdownSubmittedResponseDTO import GetMetricsURLsBreakdownSubmittedResponseDTO from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -299,3 +305,21 @@ async def upload_manual_batch( async def search_for_url(self, url: str) -> SearchURLResponse: return await self.adb_client.search_for_url(url) + + async def get_batches_aggregated_metrics(self) -> GetMetricsBatchesAggregatedResponseDTO: + return await self.adb_client.get_batches_aggregated_metrics() + + async def get_batches_breakdown_metrics(self, page: int) -> GetMetricsBatchesBreakdownResponseDTO: + return await self.adb_client.get_batches_breakdown_metrics(page=page) + + async def get_urls_breakdown_submitted_metrics(self) -> GetMetricsURLsBreakdownSubmittedResponseDTO: + return await self.adb_client.get_urls_breakdown_submitted_metrics() + + async def get_urls_aggregated_metrics(self) -> GetMetricsURLsAggregatedResponseDTO: + return await self.adb_client.get_urls_aggregated_metrics() + + async def get_urls_breakdown_pending_metrics(self) -> GetMetricsURLsBreakdownPendingResponseDTO: + return await self.adb_client.get_urls_breakdown_pending_metrics() + + async def get_backlog_metrics(self) -> GetMetricsBacklogResponseDTO: + return await self.adb_client.get_backlog_metrics() \ No newline at end of file diff --git a/core/DTOs/GetMetricsBacklogResponse.py b/core/DTOs/GetMetricsBacklogResponse.py new file mode 100644 index 00000000..0df38324 --- /dev/null +++ b/core/DTOs/GetMetricsBacklogResponse.py @@ -0,0 +1,10 @@ +import datetime + +from pydantic import BaseModel + +class GetMetricsBacklogResponseInnerDTO(BaseModel): + week_of: datetime.date + count_pending_total: int + +class GetMetricsBacklogResponseDTO(BaseModel): + entries: list[GetMetricsBacklogResponseInnerDTO] \ No newline at end of file diff --git a/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py b/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py new file mode 100644 index 00000000..565ab208 --- /dev/null +++ b/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py @@ -0,0 +1,22 @@ +from pydantic import BaseModel + +from collector_manager.enums import CollectorType + + +class GetMetricsBatchesAggregatedInnerResponseDTO(BaseModel): + count_successful_batches: int + count_failed_batches: int + count_urls: int + count_urls_pending: int + count_urls_submitted: int + count_urls_rejected: int + count_urls_errors: int + + + +class GetMetricsBatchesAggregatedResponseDTO(BaseModel): + total_batches: int + by_strategy: dict[ + CollectorType, + GetMetricsBatchesAggregatedInnerResponseDTO + ] \ No newline at end of file diff --git a/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py b/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py new file mode 100644 index 00000000..5797ab54 --- /dev/null +++ b/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py @@ -0,0 +1,19 @@ +import datetime + +from pydantic import BaseModel + +from collector_manager.enums import CollectorType + + +class GetMetricsBatchesBreakdownInnerResponseDTO(BaseModel): + batch_id: str + strategy: CollectorType + created_at: datetime.datetime + count_url_total: int + count_url_pending: int + count_url_submitted: int + count_url_rejected: int + count_url_error: int + +class GetMetricsBatchesBreakdownResponseDTO(BaseModel): + batches: 
list[GetMetricsBatchesBreakdownInnerResponseDTO] \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsAggregatedResponseDTO.py b/core/DTOs/GetMetricsURLsAggregatedResponseDTO.py new file mode 100644 index 00000000..66009223 --- /dev/null +++ b/core/DTOs/GetMetricsURLsAggregatedResponseDTO.py @@ -0,0 +1,14 @@ +import datetime + +from pydantic import BaseModel + + +class GetMetricsURLsAggregatedResponseDTO(BaseModel): + count_urls_total: int + count_urls_pending: int + count_urls_submitted: int + count_urls_rejected: int + count_urls_validated: int + count_urls_errors: int + oldest_pending_url_created_at: datetime.datetime | None = None + oldest_pending_url_id: int | None = None \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py new file mode 100644 index 00000000..88b2a404 --- /dev/null +++ b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel + +class GetMetricsURLsBreakdownPendingResponseInnerDTO(BaseModel): + week_created_at: str + count_pending_total: int + count_pending_relevant: int + count_pending_record_type: int + count_pending_agency: int + count_pending_final: int + +class GetMetricsURLsBreakdownPendingResponseDTO(BaseModel): + entries: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py new file mode 100644 index 00000000..7e17effe --- /dev/null +++ b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py @@ -0,0 +1,10 @@ +import datetime + +from pydantic import BaseModel + +class GetMetricsURLsBreakdownSubmittedInnerDTO(BaseModel): + week_of: datetime.date + count_submitted: int + +class GetMetricsURLsBreakdownSubmittedResponseDTO(BaseModel): + entries: list[GetMetricsURLsBreakdownSubmittedInnerDTO] \ No newline at end of file diff --git a/core/ScheduledTaskManager.py b/core/ScheduledTaskManager.py index 0a407d9e..e0b87247 100644 --- a/core/ScheduledTaskManager.py +++ b/core/ScheduledTaskManager.py @@ -18,6 +18,7 @@ def __init__(self, async_core: AsyncCore): # Jobs self.run_cycles_job = None self.delete_logs_job = None + self.populate_backlog_snapshot_job = None def add_scheduled_tasks(self): self.run_cycles_job = self.scheduler.add_job( @@ -35,6 +36,13 @@ def add_scheduled_tasks(self): start_date=datetime.now() + timedelta(minutes=10) ) ) + self.populate_backlog_snapshot_job = self.scheduler.add_job( + self.async_core.adb_client.populate_backlog_snapshot, + trigger=IntervalTrigger( + days=1, + start_date=datetime.now() + timedelta(minutes=20) + ) + ) def shutdown(self): if self.scheduler.running: From ee4489e5d91be0e5690087e7877f9b1ba996babd Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 6 May 2025 12:43:57 -0400 Subject: [PATCH 165/182] DRAFT --- ...e17c04_create_url_annotation_flags_view.py | 5 +- ...007bbcce3_create_url_data_sources_table.py | 129 ++++++++++++++++++ collector_db/AsyncDatabaseClient.py | 73 +++++++++- collector_db/models.py | 48 ++++++- ...tMetricsURLsBreakdownPendingResponseDTO.py | 7 +- .../integration/api/test_metrics.py | 39 ++++++ 6 files changed, 285 insertions(+), 16 deletions(-) create mode 100644 alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py create mode 100644 tests/test_automated/integration/api/test_metrics.py diff --git a/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py 
b/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py index f0250c06..0da22094 100644 --- a/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py +++ b/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py @@ -8,7 +8,6 @@ from typing import Sequence, Union from alembic import op -import sqlalchemy as sa # revision identifiers, used by Alembic. @@ -22,8 +21,7 @@ def upgrade() -> None: op.execute(""" CREATE VIEW url_annotation_flags AS SELECT - u.id, - u.outcome, + u.id as url_id, CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, @@ -39,7 +37,6 @@ def upgrade() -> None: LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id; - """) def downgrade() -> None: op.execute("DROP VIEW url_annotation_flags;") diff --git a/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py b/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py new file mode 100644 index 00000000..da896c1c --- /dev/null +++ b/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py @@ -0,0 +1,129 @@ +"""Create url_data_sources table + +Revision ID: 6f2007bbcce3 +Revises: f25852e17c04 +Create Date: 2025-05-06 11:15:24.485465 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision: str = '6f2007bbcce3' +down_revision: Union[str, None] = 'f25852e17c04' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Create url_data_sources table + op.create_table( + 'url_data_sources', + sa.Column( + 'id', + sa.Integer(), + primary_key=True + ), + sa.Column( + 'url_id', + sa.Integer(), + sa.ForeignKey( + 'urls.id', + ondelete='CASCADE' + ), + nullable=False + ), + sa.Column( + 'data_source_id', + sa.Integer(), + nullable=False + ), + sa.Column( + 'created_at', + sa.TIMESTAMP(), + nullable=False, + server_default=sa.text('now()') + ), + sa.UniqueConstraint('url_id', name='uq_url_id'), + sa.UniqueConstraint('data_source_id', name='uq_data_source_id') + ) + + # Migrate existing urls with a data source ID + op.execute(""" + INSERT INTO url_data_sources + (url_id, data_source_id) + SELECT id, data_source_id + FROM urls + WHERE data_source_id IS NOT NULL + """) + + # Drop existing data source ID column from urls table + op.drop_column('urls', 'data_source_id') + + # Add trigger to ensure linked URL has status of submitted + op.execute(""" + CREATE FUNCTION check_url_is_submitted() RETURNS trigger AS $$ + BEGIN + IF EXISTS ( + SELECT 1 FROM urls WHERE id = NEW.url_id AND outcome != 'submitted' + ) THEN + RAISE EXCEPTION 'URL status is not submitted'; + END IF; + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """) + + op.execute(""" + CREATE TRIGGER check_url_is_submitted + BEFORE INSERT OR UPDATE ON url_data_sources + FOR EACH ROW + EXECUTE FUNCTION check_url_is_submitted(); + """) + + op.execute(""" + CREATE FUNCTION prevent_status_change_if_data_source_exists() RETURNS trigger AS $$ + BEGIN + IF OLD.outcome = 'submitted' AND NEW.outcome IS DISTINCT FROM OLD.outcome THEN + IF EXISTS ( + SELECT 1 FROM url_data_sources WHERE 
url_id = OLD.id + ) THEN + RAISE EXCEPTION 'Cannot change status from submitted: related child records exist.'; + END IF; + END IF; + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """) + + op.execute(""" + CREATE TRIGGER check_status_change + BEFORE UPDATE ON urls + FOR EACH ROW + EXECUTE FUNCTION prevent_status_change_if_data_source_exists(); + """) + + +def downgrade() -> None: + # Drop new trigger and function on URLS + op.execute(""" + DROP TRIGGER IF EXISTS check_url_is_submitted ON urls; + DROP FUNCTION IF EXISTS check_url_is_submitted; + DROP TRIGGER IF EXISTS check_status_change ON urls; + DROP FUNCTION IF EXISTS prevent_status_change_if_data_source_exists; + """) + + op.drop_table('url_data_sources') + + op.add_column( + 'urls', + sa.Column( + 'data_source_id', + sa.Integer(), + nullable=True + ) + ) + + diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index cf3cb47f..458cf75c 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -27,7 +27,7 @@ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log, \ - BacklogSnapshot + BacklogSnapshot, URLAnnotationFlag, URLDataSource from collector_manager.enums import URLStatus, CollectorType from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo @@ -37,8 +37,10 @@ from core.DTOs.GetMetricsBatchesBreakdownResponseDTO import GetMetricsBatchesBreakdownResponseDTO, \ GetMetricsBatchesBreakdownInnerResponseDTO from core.DTOs.GetMetricsURLsAggregatedResponseDTO import GetMetricsURLsAggregatedResponseDTO -from core.DTOs.GetMetricsURLsBreakdownPendingResponseDTO import GetMetricsURLsBreakdownPendingResponseDTO -from core.DTOs.GetMetricsURLsBreakdownSubmittedResponseDTO import GetMetricsURLsBreakdownSubmittedResponseDTO +from core.DTOs.GetMetricsURLsBreakdownPendingResponseDTO import GetMetricsURLsBreakdownPendingResponseDTO, \ + GetMetricsURLsBreakdownPendingResponseInnerDTO +from core.DTOs.GetMetricsURLsBreakdownSubmittedResponseDTO import GetMetricsURLsBreakdownSubmittedResponseDTO, \ + GetMetricsURLsBreakdownSubmittedInnerDTO from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -1965,7 +1967,33 @@ async def get_urls_breakdown_submitted_metrics( self, session: AsyncSession ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: - pass + # TODO: Wrong submitted at time: The created at does not indicate when it was submitted + + + # Build the query + week = func.date_trunc('week', URLDataSource.created_at) + query = ( + select( + week.label('week'), + func.count(URLDataSource.id).label('count_submitted'), + ) + .group_by(week) + .order_by(week.asc()) + ) + + # Execute the query + raw_results = await session.execute(query) + results = raw_results.all() + final_results: list[GetMetricsURLsBreakdownSubmittedInnerDTO] = [] + for result in results: + dto = GetMetricsURLsBreakdownSubmittedInnerDTO( + week_of=result.week, + count_submitted=result.count_submitted + ) + final_results.append(dto) + return 
GetMetricsURLsBreakdownSubmittedResponseDTO( + entries=final_results + ) @session_manager async def get_urls_aggregated_metrics( self, session: AsyncSession @@ -2028,13 +2056,44 @@ async def get_urls_breakdown_pending_metrics( self, session: AsyncSession ) -> GetMetricsURLsBreakdownPendingResponseDTO: - # I will need to get ways to identify the status of each url - # Probably would benefit from a view of some sort + flags = URLAnnotationFlag + url = URL + + week = func.date_trunc('week', url.created_at) + # Build the query + query = ( + select( + week.label('week'), + func.count(url.id).label('count_total'), + func.count(case((flags.has_user_record_type_suggestion == True, 1))).label('user_record_type_count'), + func.count(case((flags.has_user_relevant_suggestion == True, 1))).label('user_relevant_count'), + func.count(case((flags.has_user_agency_suggestion == True, 1))).label('user_agency_count'), + ) + .where(url.outcome == URLStatus.PENDING.value) + .join(flags, flags.url_id == url.id) + .group_by(week) + .order_by(week.asc()) + ) + # Execute the query and return the results + results = await session.execute(query) + # .all() keeps the full rows; .scalars() would drop every column after the first + all_results = results.all() + final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = [] - pass + for result in all_results: + dto = GetMetricsURLsBreakdownPendingResponseInnerDTO( + week_created_at=result.week.date().isoformat(), + count_pending_total=result.count_total, + count_pending_relevant_user=result.user_relevant_count, + count_pending_record_type_user=result.user_record_type_count, + count_pending_agency_user=result.user_agency_count, + ) + final_results.append(dto) + return GetMetricsURLsBreakdownPendingResponseDTO( + entries=final_results, + ) @session_manager async def get_backlog_metrics( diff --git a/collector_db/models.py b/collector_db/models.py index d3c9b916..375e5203 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -106,7 +106,6 @@ class URL(Base): record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=True) created_at = get_created_at_column() updated_at = get_updated_at_column() - data_source_id = Column(Integer, nullable=True) # Relationships batch = relationship("Batch", back_populates="urls") @@ -137,6 +136,15 @@ class URL(Base): confirmed_agencies = relationship( "ConfirmedURLAgency", ) + annotation_flags = relationship( + "URLAnnotationFlag", + back_populates="url" + ) + data_source = relationship( + "URLDataSource", + back_populates="url", + uselist=False + ) class URLOptionalDataSourceMetadata(Base): @@ -460,3 +468,41 @@ class BacklogSnapshot(Base): id = Column(Integer, primary_key=True, autoincrement=True) count_pending_total = Column(Integer, nullable=False) created_at = get_created_at_column() + +class URLAnnotationFlag(Base): + __tablename__ = "url_annotation_flags" + + # Column names must match the url_annotation_flags view (migration f25852e17c04) + url_id = Column( + Integer, + ForeignKey("urls.id"), + primary_key=True, + nullable=False + ) + has_auto_record_type_suggestion = Column(Boolean, nullable=False) + has_auto_relevant_suggestion = Column(Boolean, nullable=False) + has_auto_agency_suggestion = Column(Boolean, nullable=False) + has_user_record_type_suggestion = Column(Boolean, nullable=False) + has_user_relevant_suggestion = Column(Boolean, nullable=False) + has_user_agency_suggestion = Column(Boolean, nullable=False) + was_reviewed = Column(Boolean, nullable=False) + + # Relationships + url = relationship( + "URL", + back_populates="annotation_flags" + ) + +class URLDataSource(Base): + __tablename__ = "url_data_sources" + + id = Column(Integer, primary_key=True, autoincrement=True) + url_id = 
Column(Integer, ForeignKey("urls.id"), nullable=False) + data_source_id = Column(Integer, nullable=False) + created_at = get_created_at_column() + + # Relationships + url = relationship( + "URL", + back_populates="data_source", + uselist=False + ) \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py index 88b2a404..304555b0 100644 --- a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py +++ b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py @@ -3,10 +3,9 @@ class GetMetricsURLsBreakdownPendingResponseInnerDTO(BaseModel): week_created_at: str count_pending_total: int - count_pending_relevant: int - count_pending_record_type: int - count_pending_agency: int - count_pending_final: int + count_pending_relevant_user: int + count_pending_record_type_user: int + count_pending_agency_user: int class GetMetricsURLsBreakdownPendingResponseDTO(BaseModel): entries: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_metrics.py b/tests/test_automated/integration/api/test_metrics.py new file mode 100644 index 00000000..de5da4d3 --- /dev/null +++ b/tests/test_automated/integration/api/test_metrics.py @@ -0,0 +1,39 @@ +import pytest + + +@pytest.mark.asyncio +async def test_get_batches_aggregated_metrics(api_test_helper): + + + raise NotImplementedError + +@pytest.mark.asyncio +async def test_get_batches_breakdown_metrics(api_test_helper): + raise NotImplementedError + +@pytest.mark.asyncio +async def test_get_urls_breakdown_submitted_metrics(api_test_helper): + # Create URLs with submitted status, broken down in different amounts by different weeks + # And ensure the URLs are + raise NotImplementedError + +@pytest.mark.asyncio +async def test_get_urls_breakdown_pending_metrics(api_test_helper): + # Build URLs, broken down into three separate weeks, + # with each week having a different number of pending URLs + # with a different number of kinds of annotations per URL + + # Additionally, add some URLs that are submitted, + # validated, errored, and ensure they are not counted + + + raise NotImplementedError + +@pytest.mark.asyncio +async def test_get_backlog_metrics(api_test_helper): + # Populate the backlog table and test that backlog metrics are returned on a weekly basis + + # Ensure that multiple days in each week are added to the backlog table, with different values + + # Test that the count closest to the beginning of the week is returned for each week + + raise NotImplementedError From c6c829900fcd1c7b0f05370ad4f7f85ab48d0e15 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 7 May 2025 14:07:21 -0400 Subject: [PATCH 166/182] DRAFT --- collector_db/AsyncDatabaseClient.py | 4 ++++ collector_db/DTOs/URLInfo.py | 1 + tests/test_automated/integration/api/test_metrics.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 458cf75c..d30d4aeb 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1324,6 +1324,8 @@ async def insert_url(self, session: AsyncSession, url_info: URLInfo) -> int: collector_metadata=url_info.collector_metadata, outcome=url_info.outcome.value ) + if url_info.created_at is not None: + url_entry.created_at = url_info.created_at session.add(url_entry) await session.flush() return url_entry.id @@ -1370,6 +1372,8 @@ async def insert_batch(self, session: AsyncSession, batch_info: BatchInfo) -> in 
record_type_match_rate=batch_info.record_type_match_rate, record_category_match_rate=batch_info.record_category_match_rate, ) + if batch_info.date_generated is not None: + batch.date_generated = batch_info.date_generated session.add(batch) await session.flush() return batch.id diff --git a/collector_db/DTOs/URLInfo.py b/collector_db/DTOs/URLInfo.py index c47d2830..5a1d2221 100644 --- a/collector_db/DTOs/URLInfo.py +++ b/collector_db/DTOs/URLInfo.py @@ -13,4 +13,5 @@ class URLInfo(BaseModel): collector_metadata: Optional[dict] = None outcome: URLStatus = URLStatus.PENDING updated_at: Optional[datetime.datetime] = None + created_at: Optional[datetime.datetime] = None name: Optional[str] = None diff --git a/tests/test_automated/integration/api/test_metrics.py b/tests/test_automated/integration/api/test_metrics.py index de5da4d3..44eff414 100644 --- a/tests/test_automated/integration/api/test_metrics.py +++ b/tests/test_automated/integration/api/test_metrics.py @@ -3,7 +3,9 @@ @pytest.mark.asyncio async def test_get_batches_aggregated_metrics(api_test_helper): + # Create successful batches with URLs of different statuses + # Create failed batches raise NotImplementedError From 8d62278720d72f9812860b12ec44666c26b4accd Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 7 May 2025 15:03:47 -0400 Subject: [PATCH 167/182] Convert to full uv/pyproject dependency management --- .github/workflows/test_app.yml | 4 - Dockerfile | 10 +- README.md | 6 + pyproject.toml | 53 +- requirements.txt | 54 - start_mirrored_local_app.py | 4 - uv.lock | 2824 ++++++++++++++++++++++++++++++++ 7 files changed, 2887 insertions(+), 68 deletions(-) delete mode 100644 requirements.txt create mode 100644 uv.lock diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index 5b4da872..5cff8696 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -37,10 +37,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - uv pip install --system -r requirements.txt - - name: Run tests run: | pytest tests/test_automated diff --git a/Dockerfile b/Dockerfile index 5352bc99..58111591 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,12 +6,14 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ # Set working directory WORKDIR /app -COPY requirements.txt ./requirements.txt +COPY pyproject.toml uv.lock ./ # Install dependencies -RUN uv pip install --system -r requirements.txt -RUN playwright install chromium -RUN playwright install-deps chromium +RUN uv sync +# Must call from the root directory because uv does not add playwright to path +RUN /app/.venv/bin/playwright install-deps chromium +RUN /app/.venv/bin/playwright install chromium + # Copy project files COPY api ./api diff --git a/README.md b/README.md index 5a39d2bd..78b6fbfe 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,12 @@ core | A module which integrates other components, such as collector_manager and api | API for interacting with collector_manager, core, and collector_db local_database | Resources for setting up a test database for local development +## Installation + +``` +uv sync +``` + ## How to use 1. 
Create an .env file in this directory with these contents, or set the environment variable another way: `VUE_APP_PDAP_API_KEY=KeyGoesHere` diff --git a/pyproject.toml b/pyproject.toml index 161cc214..de0abfcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,52 @@ [project] -name="source-collector" -version="0.1.0" \ No newline at end of file +name = "data-source-identification" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "aiohttp~=3.11.11", + "alembic~=1.14.0", + "apscheduler~=3.11.0", + "asyncpg~=0.30.0", + "beautifulsoup4>=4.12.3", + "bs4~=0.0.2", + "ckanapi~=4.8", + "datasets~=2.19.1", + "docker~=7.1.0", + "fastapi[standard]~=0.115.6", + "from-root~=1.3.0", + "google-api-python-client>=2.156.0", + "httpx~=0.28.1", + "huggingface-hub~=0.28.1", + "keras~=2.15.0", + "lxml~=5.1.0", + "marshmallow~=3.23.2", + "numpy~=1.26.4", + "openai~=1.60.1", + "pandas~=2.2.3", + "playwright~=1.49.1", + "psycopg2-binary~=2.9.6", + "psycopg[binary]~=3.1.20", + "pydantic~=2.10.6", + "pyjwt~=2.10.1", + "python-dotenv~=1.0.1", + "requests~=2.32.3", + "sqlalchemy~=2.0.36", + "starlette~=0.45.3", + "tensorflow-cpu~=2.15.1", + "tensorflow-io-gcs-filesystem==0.31.0", + "tqdm>=4.64.1", + "transformers~=4.40.2", + "urllib3~=1.26.18", + "uvicorn~=0.34.0", +] + +[dependency-groups] +dev = [ + "docker>=7.1.0", + "pytest>=7.2.2", + "pytest-asyncio~=0.25.2", + "pytest-mock==3.12.0", + "pytest-timeout~=2.3.1", +] + + diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 911e66fb..00000000 --- a/requirements.txt +++ /dev/null @@ -1,54 +0,0 @@ -requests~=2.32.3 -python-dotenv~=1.0.1 -bs4~=0.0.2 -tqdm>=4.64.1 -pytest>=7.2.2 -pytest-mock==3.12.0 -urllib3~=1.26.18 -psycopg2-binary~=2.9.6 -pandas~=2.2.3 -datasets~=2.19.1 -# common_crawler only -huggingface-hub~=0.28.1 - -# html_tag_collector_only -lxml~=5.1.0 -beautifulsoup4>=4.12.3 - -# CKAN Collector -from-root~=1.3.0 - -# Google Collector -google-api-python-client>=2.156.0 # TODO: Check for delete -marshmallow~=3.23.2 - -sqlalchemy~=2.0.36 -fastapi[standard]~=0.115.6 -httpx~=0.28.1 -ckanapi~=4.8 # TODO: Check for delete -psycopg[binary]~=3.1.20 -APScheduler~=3.11.0 -alembic~=1.14.0 -asyncpg~=0.30.0 -pytest-asyncio~=0.25.2 -transformers~=4.40.2 -tensorflow-cpu~=2.15.1 -keras~=2.15.0 - - -# HTML Collector -playwright~=1.49.1 - -# Security Manager -PyJWT~=2.10.1 - -# Tests -pytest-timeout~=2.3.1 - -openai~=1.60.1 -aiohttp~=3.11.11 -uvicorn~=0.34.0 -pydantic~=2.10.6 -starlette~=0.45.3 -numpy~=1.26.4 -docker~=7.1.0 \ No newline at end of file diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py index 236030d0..7bcd573f 100644 --- a/start_mirrored_local_app.py +++ b/start_mirrored_local_app.py @@ -1,10 +1,6 @@ """ Starts a local instance of the application utilizing a database mirrored from production. 
- -Because this is used for testing only, the docker module is not included in -requirements.txt, and must be installed separately via -`pip install docker` """ import uvicorn diff --git a/uv.lock b/uv.lock new file mode 100644 index 00000000..f2ea60ae --- /dev/null +++ b/uv.lock @@ -0,0 +1,2824 @@ +version = 1 +revision = 2 +requires-python = ">=3.11" +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version == '3.12.*'", + "python_full_version < '3.12'", +] + +[[package]] +name = "absl-py" +version = "2.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/f0/e6342091061ed3a46aadc116b13edd7bb5249c3ab1b3ef07f24b0c248fc3/absl_py-2.2.2.tar.gz", hash = "sha256:bf25b2c2eed013ca456918c453d687eab4e8309fba81ee2f4c1a6aa2494175eb", size = 119982, upload_time = "2025-04-03T12:41:04.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/d4/349f7f4bd5ea92dab34f5bb0fe31775ef6c311427a14d5a5b31ecb442341/absl_py-2.2.2-py3-none-any.whl", hash = "sha256:e5797bc6abe45f64fd95dc06394ca3f2bedf3b5d895e9da691c9ee3397d70092", size = 135565, upload_time = "2025-04-03T12:41:03.172Z" }, +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload_time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload_time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.11.18" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/e7/fa1a8c00e2c54b05dc8cb5d1439f627f7c267874e3f7bb047146116020f9/aiohttp-3.11.18.tar.gz", hash = "sha256:ae856e1138612b7e412db63b7708735cff4d38d0399f6a5435d3dac2669f558a", size = 7678653, upload_time = "2025-04-21T09:43:09.191Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/10/fd9ee4f9e042818c3c2390054c08ccd34556a3cb209d83285616434cf93e/aiohttp-3.11.18-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:427fdc56ccb6901ff8088544bde47084845ea81591deb16f957897f0f0ba1be9", size = 712088, upload_time = "2025-04-21T09:40:55.776Z" }, + { url = "https://files.pythonhosted.org/packages/22/eb/6a77f055ca56f7aae2cd2a5607a3c9e7b9554f1497a069dcfcb52bfc9540/aiohttp-3.11.18-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c828b6d23b984255b85b9b04a5b963a74278b7356a7de84fda5e3b76866597b", size = 471450, upload_time = "2025-04-21T09:40:57.301Z" }, + { url = "https://files.pythonhosted.org/packages/78/dc/5f3c0d27c91abf0bb5d103e9c9b0ff059f60cf6031a5f06f456c90731f42/aiohttp-3.11.18-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5c2eaa145bb36b33af1ff2860820ba0589e165be4ab63a49aebfd0981c173b66", size = 457836, upload_time = "2025-04-21T09:40:59.322Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/7b/55b65af9ef48b9b811c91ff8b5b9de9650c71147f10523e278d297750bc8/aiohttp-3.11.18-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d518ce32179f7e2096bf4e3e8438cf445f05fedd597f252de9f54c728574756", size = 1690978, upload_time = "2025-04-21T09:41:00.795Z" }, + { url = "https://files.pythonhosted.org/packages/a2/5a/3f8938c4f68ae400152b42742653477fc625d6bfe02e764f3521321c8442/aiohttp-3.11.18-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0700055a6e05c2f4711011a44364020d7a10fbbcd02fbf3e30e8f7e7fddc8717", size = 1745307, upload_time = "2025-04-21T09:41:02.89Z" }, + { url = "https://files.pythonhosted.org/packages/b4/42/89b694a293333ef6f771c62da022163bcf44fb03d4824372d88e3dc12530/aiohttp-3.11.18-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8bd1cde83e4684324e6ee19adfc25fd649d04078179890be7b29f76b501de8e4", size = 1780692, upload_time = "2025-04-21T09:41:04.461Z" }, + { url = "https://files.pythonhosted.org/packages/e2/ce/1a75384e01dd1bf546898b6062b1b5f7a59b6692ef802e4dd6db64fed264/aiohttp-3.11.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73b8870fe1c9a201b8c0d12c94fe781b918664766728783241a79e0468427e4f", size = 1676934, upload_time = "2025-04-21T09:41:06.728Z" }, + { url = "https://files.pythonhosted.org/packages/a5/31/442483276e6c368ab5169797d9873b5875213cbcf7e74b95ad1c5003098a/aiohttp-3.11.18-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25557982dd36b9e32c0a3357f30804e80790ec2c4d20ac6bcc598533e04c6361", size = 1621190, upload_time = "2025-04-21T09:41:08.293Z" }, + { url = "https://files.pythonhosted.org/packages/7b/83/90274bf12c079457966008a58831a99675265b6a34b505243e004b408934/aiohttp-3.11.18-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7e889c9df381a2433802991288a61e5a19ceb4f61bd14f5c9fa165655dcb1fd1", size = 1658947, upload_time = "2025-04-21T09:41:11.054Z" }, + { url = "https://files.pythonhosted.org/packages/91/c1/da9cee47a0350b78fdc93670ebe7ad74103011d7778ab4c382ca4883098d/aiohttp-3.11.18-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:9ea345fda05bae217b6cce2acf3682ce3b13d0d16dd47d0de7080e5e21362421", size = 1654443, upload_time = "2025-04-21T09:41:13.213Z" }, + { url = "https://files.pythonhosted.org/packages/c9/f2/73cbe18dc25d624f79a09448adfc4972f82ed6088759ddcf783cd201956c/aiohttp-3.11.18-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9f26545b9940c4b46f0a9388fd04ee3ad7064c4017b5a334dd450f616396590e", size = 1644169, upload_time = "2025-04-21T09:41:14.827Z" }, + { url = "https://files.pythonhosted.org/packages/5b/32/970b0a196c4dccb1b0cfa5b4dc3b20f63d76f1c608f41001a84b2fd23c3d/aiohttp-3.11.18-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:3a621d85e85dccabd700294494d7179ed1590b6d07a35709bb9bd608c7f5dd1d", size = 1728532, upload_time = "2025-04-21T09:41:17.168Z" }, + { url = "https://files.pythonhosted.org/packages/0b/50/b1dc810a41918d2ea9574e74125eb053063bc5e14aba2d98966f7d734da0/aiohttp-3.11.18-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9c23fd8d08eb9c2af3faeedc8c56e134acdaf36e2117ee059d7defa655130e5f", size = 1750310, upload_time = "2025-04-21T09:41:19.353Z" }, + { url = "https://files.pythonhosted.org/packages/95/24/39271f5990b35ff32179cc95537e92499d3791ae82af7dcf562be785cd15/aiohttp-3.11.18-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9e6b0e519067caa4fd7fb72e3e8002d16a68e84e62e7291092a5433763dc0dd", size = 1691580, 
upload_time = "2025-04-21T09:41:21.868Z" }, + { url = "https://files.pythonhosted.org/packages/6b/78/75d0353feb77f041460564f12fe58e456436bbc00cbbf5d676dbf0038cc2/aiohttp-3.11.18-cp311-cp311-win32.whl", hash = "sha256:122f3e739f6607e5e4c6a2f8562a6f476192a682a52bda8b4c6d4254e1138f4d", size = 417565, upload_time = "2025-04-21T09:41:24.78Z" }, + { url = "https://files.pythonhosted.org/packages/ed/97/b912dcb654634a813f8518de359364dfc45976f822116e725dc80a688eee/aiohttp-3.11.18-cp311-cp311-win_amd64.whl", hash = "sha256:e6f3c0a3a1e73e88af384b2e8a0b9f4fb73245afd47589df2afcab6b638fa0e6", size = 443652, upload_time = "2025-04-21T09:41:26.48Z" }, + { url = "https://files.pythonhosted.org/packages/b5/d2/5bc436f42bf4745c55f33e1e6a2d69e77075d3e768e3d1a34f96ee5298aa/aiohttp-3.11.18-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:63d71eceb9cad35d47d71f78edac41fcd01ff10cacaa64e473d1aec13fa02df2", size = 706671, upload_time = "2025-04-21T09:41:28.021Z" }, + { url = "https://files.pythonhosted.org/packages/fe/d0/2dbabecc4e078c0474abb40536bbde717fb2e39962f41c5fc7a216b18ea7/aiohttp-3.11.18-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d1929da615840969929e8878d7951b31afe0bac883d84418f92e5755d7b49508", size = 466169, upload_time = "2025-04-21T09:41:29.783Z" }, + { url = "https://files.pythonhosted.org/packages/70/84/19edcf0b22933932faa6e0be0d933a27bd173da02dc125b7354dff4d8da4/aiohttp-3.11.18-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d0aebeb2392f19b184e3fdd9e651b0e39cd0f195cdb93328bd124a1d455cd0e", size = 457554, upload_time = "2025-04-21T09:41:31.327Z" }, + { url = "https://files.pythonhosted.org/packages/32/d0/e8d1f034ae5624a0f21e4fb3feff79342ce631f3a4d26bd3e58b31ef033b/aiohttp-3.11.18-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3849ead845e8444f7331c284132ab314b4dac43bfae1e3cf350906d4fff4620f", size = 1690154, upload_time = "2025-04-21T09:41:33.541Z" }, + { url = "https://files.pythonhosted.org/packages/16/de/2f9dbe2ac6f38f8495562077131888e0d2897e3798a0ff3adda766b04a34/aiohttp-3.11.18-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5e8452ad6b2863709f8b3d615955aa0807bc093c34b8e25b3b52097fe421cb7f", size = 1733402, upload_time = "2025-04-21T09:41:35.634Z" }, + { url = "https://files.pythonhosted.org/packages/e0/04/bd2870e1e9aef990d14b6df2a695f17807baf5c85a4c187a492bda569571/aiohttp-3.11.18-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b8d2b42073611c860a37f718b3d61ae8b4c2b124b2e776e2c10619d920350ec", size = 1783958, upload_time = "2025-04-21T09:41:37.456Z" }, + { url = "https://files.pythonhosted.org/packages/23/06/4203ffa2beb5bedb07f0da0f79b7d9039d1c33f522e0d1a2d5b6218e6f2e/aiohttp-3.11.18-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40fbf91f6a0ac317c0a07eb328a1384941872f6761f2e6f7208b63c4cc0a7ff6", size = 1695288, upload_time = "2025-04-21T09:41:39.756Z" }, + { url = "https://files.pythonhosted.org/packages/30/b2/e2285dda065d9f29ab4b23d8bcc81eb881db512afb38a3f5247b191be36c/aiohttp-3.11.18-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ff5625413fec55216da5eaa011cf6b0a2ed67a565914a212a51aa3755b0009", size = 1618871, upload_time = "2025-04-21T09:41:41.972Z" }, + { url = "https://files.pythonhosted.org/packages/57/e0/88f2987885d4b646de2036f7296ebea9268fdbf27476da551c1a7c158bc0/aiohttp-3.11.18-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:7f33a92a2fde08e8c6b0c61815521324fc1612f397abf96eed86b8e31618fdb4", size = 1646262, upload_time = "2025-04-21T09:41:44.192Z" }, + { url = "https://files.pythonhosted.org/packages/e0/19/4d2da508b4c587e7472a032290b2981f7caeca82b4354e19ab3df2f51d56/aiohttp-3.11.18-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:11d5391946605f445ddafda5eab11caf310f90cdda1fd99865564e3164f5cff9", size = 1677431, upload_time = "2025-04-21T09:41:46.049Z" }, + { url = "https://files.pythonhosted.org/packages/eb/ae/047473ea50150a41440f3265f53db1738870b5a1e5406ece561ca61a3bf4/aiohttp-3.11.18-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3cc314245deb311364884e44242e00c18b5896e4fe6d5f942e7ad7e4cb640adb", size = 1637430, upload_time = "2025-04-21T09:41:47.973Z" }, + { url = "https://files.pythonhosted.org/packages/11/32/c6d1e3748077ce7ee13745fae33e5cb1dac3e3b8f8787bf738a93c94a7d2/aiohttp-3.11.18-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0f421843b0f70740772228b9e8093289924359d306530bcd3926f39acbe1adda", size = 1703342, upload_time = "2025-04-21T09:41:50.323Z" }, + { url = "https://files.pythonhosted.org/packages/c5/1d/a3b57bfdbe285f0d45572d6d8f534fd58761da3e9cbc3098372565005606/aiohttp-3.11.18-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e220e7562467dc8d589e31c1acd13438d82c03d7f385c9cd41a3f6d1d15807c1", size = 1740600, upload_time = "2025-04-21T09:41:52.111Z" }, + { url = "https://files.pythonhosted.org/packages/a5/71/f9cd2fed33fa2b7ce4d412fb7876547abb821d5b5520787d159d0748321d/aiohttp-3.11.18-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ab2ef72f8605046115bc9aa8e9d14fd49086d405855f40b79ed9e5c1f9f4faea", size = 1695131, upload_time = "2025-04-21T09:41:53.94Z" }, + { url = "https://files.pythonhosted.org/packages/97/97/d1248cd6d02b9de6aa514793d0dcb20099f0ec47ae71a933290116c070c5/aiohttp-3.11.18-cp312-cp312-win32.whl", hash = "sha256:12a62691eb5aac58d65200c7ae94d73e8a65c331c3a86a2e9670927e94339ee8", size = 412442, upload_time = "2025-04-21T09:41:55.689Z" }, + { url = "https://files.pythonhosted.org/packages/33/9a/e34e65506e06427b111e19218a99abf627638a9703f4b8bcc3e3021277ed/aiohttp-3.11.18-cp312-cp312-win_amd64.whl", hash = "sha256:364329f319c499128fd5cd2d1c31c44f234c58f9b96cc57f743d16ec4f3238c8", size = 439444, upload_time = "2025-04-21T09:41:57.977Z" }, + { url = "https://files.pythonhosted.org/packages/0a/18/be8b5dd6b9cf1b2172301dbed28e8e5e878ee687c21947a6c81d6ceaa15d/aiohttp-3.11.18-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:474215ec618974054cf5dc465497ae9708543cbfc312c65212325d4212525811", size = 699833, upload_time = "2025-04-21T09:42:00.298Z" }, + { url = "https://files.pythonhosted.org/packages/0d/84/ecdc68e293110e6f6f6d7b57786a77555a85f70edd2b180fb1fafaff361a/aiohttp-3.11.18-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6ced70adf03920d4e67c373fd692123e34d3ac81dfa1c27e45904a628567d804", size = 462774, upload_time = "2025-04-21T09:42:02.015Z" }, + { url = "https://files.pythonhosted.org/packages/d7/85/f07718cca55884dad83cc2433746384d267ee970e91f0dcc75c6d5544079/aiohttp-3.11.18-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2d9f6c0152f8d71361905aaf9ed979259537981f47ad099c8b3d81e0319814bd", size = 454429, upload_time = "2025-04-21T09:42:03.728Z" }, + { url = "https://files.pythonhosted.org/packages/82/02/7f669c3d4d39810db8842c4e572ce4fe3b3a9b82945fdd64affea4c6947e/aiohttp-3.11.18-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a35197013ed929c0aed5c9096de1fc5a9d336914d73ab3f9df14741668c0616c", size = 1670283, 
upload_time = "2025-04-21T09:42:06.053Z" }, + { url = "https://files.pythonhosted.org/packages/ec/79/b82a12f67009b377b6c07a26bdd1b81dab7409fc2902d669dbfa79e5ac02/aiohttp-3.11.18-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:540b8a1f3a424f1af63e0af2d2853a759242a1769f9f1ab053996a392bd70118", size = 1717231, upload_time = "2025-04-21T09:42:07.953Z" }, + { url = "https://files.pythonhosted.org/packages/a6/38/d5a1f28c3904a840642b9a12c286ff41fc66dfa28b87e204b1f242dbd5e6/aiohttp-3.11.18-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9e6710ebebfce2ba21cee6d91e7452d1125100f41b906fb5af3da8c78b764c1", size = 1769621, upload_time = "2025-04-21T09:42:09.855Z" }, + { url = "https://files.pythonhosted.org/packages/53/2d/deb3749ba293e716b5714dda06e257f123c5b8679072346b1eb28b766a0b/aiohttp-3.11.18-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8af2ef3b4b652ff109f98087242e2ab974b2b2b496304063585e3d78de0b000", size = 1678667, upload_time = "2025-04-21T09:42:11.741Z" }, + { url = "https://files.pythonhosted.org/packages/b8/a8/04b6e11683a54e104b984bd19a9790eb1ae5f50968b601bb202d0406f0ff/aiohttp-3.11.18-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28c3f975e5ae3dbcbe95b7e3dcd30e51da561a0a0f2cfbcdea30fc1308d72137", size = 1601592, upload_time = "2025-04-21T09:42:14.137Z" }, + { url = "https://files.pythonhosted.org/packages/5e/9d/c33305ae8370b789423623f0e073d09ac775cd9c831ac0f11338b81c16e0/aiohttp-3.11.18-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c28875e316c7b4c3e745172d882d8a5c835b11018e33432d281211af35794a93", size = 1621679, upload_time = "2025-04-21T09:42:16.056Z" }, + { url = "https://files.pythonhosted.org/packages/56/45/8e9a27fff0538173d47ba60362823358f7a5f1653c6c30c613469f94150e/aiohttp-3.11.18-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:13cd38515568ae230e1ef6919e2e33da5d0f46862943fcda74e7e915096815f3", size = 1656878, upload_time = "2025-04-21T09:42:18.368Z" }, + { url = "https://files.pythonhosted.org/packages/84/5b/8c5378f10d7a5a46b10cb9161a3aac3eeae6dba54ec0f627fc4ddc4f2e72/aiohttp-3.11.18-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0e2a92101efb9f4c2942252c69c63ddb26d20f46f540c239ccfa5af865197bb8", size = 1620509, upload_time = "2025-04-21T09:42:20.141Z" }, + { url = "https://files.pythonhosted.org/packages/9e/2f/99dee7bd91c62c5ff0aa3c55f4ae7e1bc99c6affef780d7777c60c5b3735/aiohttp-3.11.18-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e6d3e32b8753c8d45ac550b11a1090dd66d110d4ef805ffe60fa61495360b3b2", size = 1680263, upload_time = "2025-04-21T09:42:21.993Z" }, + { url = "https://files.pythonhosted.org/packages/03/0a/378745e4ff88acb83e2d5c884a4fe993a6e9f04600a4560ce0e9b19936e3/aiohttp-3.11.18-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ea4cf2488156e0f281f93cc2fd365025efcba3e2d217cbe3df2840f8c73db261", size = 1715014, upload_time = "2025-04-21T09:42:23.87Z" }, + { url = "https://files.pythonhosted.org/packages/f6/0b/b5524b3bb4b01e91bc4323aad0c2fcaebdf2f1b4d2eb22743948ba364958/aiohttp-3.11.18-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9d4df95ad522c53f2b9ebc07f12ccd2cb15550941e11a5bbc5ddca2ca56316d7", size = 1666614, upload_time = "2025-04-21T09:42:25.764Z" }, + { url = "https://files.pythonhosted.org/packages/c7/b7/3d7b036d5a4ed5a4c704e0754afe2eef24a824dfab08e6efbffb0f6dd36a/aiohttp-3.11.18-cp313-cp313-win32.whl", hash = "sha256:cdd1bbaf1e61f0d94aced116d6e95fe25942f7a5f42382195fd9501089db5d78", 
size = 411358, upload_time = "2025-04-21T09:42:27.558Z" }, + { url = "https://files.pythonhosted.org/packages/1e/3c/143831b32cd23b5263a995b2a1794e10aa42f8a895aae5074c20fda36c07/aiohttp-3.11.18-cp313-cp313-win_amd64.whl", hash = "sha256:bdd619c27e44382cf642223f11cfd4d795161362a5a1fc1fa3940397bc89db01", size = 437658, upload_time = "2025-04-21T09:42:29.209Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/b5/6d55e80f6d8a08ce22b982eafa278d823b541c925f11ee774b0b9c43473d/aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54", size = 19424, upload_time = "2024-12-13T17:10:40.86Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597, upload_time = "2024-12-13T17:10:38.469Z" }, +] + +[[package]] +name = "alembic" +version = "1.14.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mako" }, + { name = "sqlalchemy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/09/f844822e4e847a3f0bd41797f93c4674cd4d2462a3f6c459aa528cdf786e/alembic-1.14.1.tar.gz", hash = "sha256:496e888245a53adf1498fcab31713a469c65836f8de76e01399aa1c3e90dd213", size = 1918219, upload_time = "2025-01-19T23:15:30.12Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/7e/ac0991d1745f7d755fc1cd381b3990a45b404b4d008fc75e2a983516fbfe/alembic-1.14.1-py3-none-any.whl", hash = "sha256:1acdd7a3a478e208b0503cd73614d5e4c6efafa4e73518bb60e4f2846a37b1c5", size = 233565, upload_time = "2025-01-19T23:15:32.523Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload_time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload_time = "2025-03-17T00:02:54.77Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload_time = "2025-03-17T00:02:52.713Z" }, +] + +[[package]] +name = "apscheduler" +version = "3.11.0" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "tzlocal" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4e/00/6d6814ddc19be2df62c8c898c4df6b5b1914f3bd024b780028caa392d186/apscheduler-3.11.0.tar.gz", hash = "sha256:4c622d250b0955a65d5d0eb91c33e6d43fd879834bf541e0a18661ae60460133", size = 107347, upload_time = "2024-11-24T19:39:26.463Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/ae/9a053dd9229c0fde6b1f1f33f609ccff1ee79ddda364c756a924c6d8563b/APScheduler-3.11.0-py3-none-any.whl", hash = "sha256:fc134ca32e50f5eadcc4938e3a4545ab19131435e851abb40b34d63d5141c6da", size = 64004, upload_time = "2024-11-24T19:39:24.442Z" }, +] + +[[package]] +name = "astunparse" +version = "1.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, + { name = "wheel" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/af/4182184d3c338792894f34a62672919db7ca008c89abee9b564dd34d8029/astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872", size = 18290, upload_time = "2019-12-22T18:12:13.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/03/13dde6512ad7b4557eb792fbcf0c653af6076b81e5941d36ec61f7ce6028/astunparse-1.6.3-py2.py3-none-any.whl", hash = "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8", size = 12732, upload_time = "2019-12-22T18:12:11.297Z" }, +] + +[[package]] +name = "asyncpg" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/4c/7c991e080e106d854809030d8584e15b2e996e26f16aee6d757e387bc17d/asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851", size = 957746, upload_time = "2024-10-20T00:30:41.127Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/0e/f5d708add0d0b97446c402db7e8dd4c4183c13edaabe8a8500b411e7b495/asyncpg-0.30.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a", size = 674506, upload_time = "2024-10-20T00:29:27.988Z" }, + { url = "https://files.pythonhosted.org/packages/6a/a0/67ec9a75cb24a1d99f97b8437c8d56da40e6f6bd23b04e2f4ea5d5ad82ac/asyncpg-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed", size = 645922, upload_time = "2024-10-20T00:29:29.391Z" }, + { url = "https://files.pythonhosted.org/packages/5c/d9/a7584f24174bd86ff1053b14bb841f9e714380c672f61c906eb01d8ec433/asyncpg-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a", size = 3079565, upload_time = "2024-10-20T00:29:30.832Z" }, + { url = "https://files.pythonhosted.org/packages/a0/d7/a4c0f9660e333114bdb04d1a9ac70db690dd4ae003f34f691139a5cbdae3/asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956", size = 3109962, upload_time = "2024-10-20T00:29:33.114Z" }, + { url = "https://files.pythonhosted.org/packages/3c/21/199fd16b5a981b1575923cbb5d9cf916fdc936b377e0423099f209e7e73d/asyncpg-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056", size = 3064791, upload_time = "2024-10-20T00:29:34.677Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/52/0004809b3427534a0c9139c08c87b515f1c77a8376a50ae29f001e53962f/asyncpg-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454", size = 3188696, upload_time = "2024-10-20T00:29:36.389Z" }, + { url = "https://files.pythonhosted.org/packages/52/cb/fbad941cd466117be58b774a3f1cc9ecc659af625f028b163b1e646a55fe/asyncpg-0.30.0-cp311-cp311-win32.whl", hash = "sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d", size = 567358, upload_time = "2024-10-20T00:29:37.915Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0a/0a32307cf166d50e1ad120d9b81a33a948a1a5463ebfa5a96cc5606c0863/asyncpg-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f", size = 629375, upload_time = "2024-10-20T00:29:39.987Z" }, + { url = "https://files.pythonhosted.org/packages/4b/64/9d3e887bb7b01535fdbc45fbd5f0a8447539833b97ee69ecdbb7a79d0cb4/asyncpg-0.30.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e", size = 673162, upload_time = "2024-10-20T00:29:41.88Z" }, + { url = "https://files.pythonhosted.org/packages/6e/eb/8b236663f06984f212a087b3e849731f917ab80f84450e943900e8ca4052/asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a", size = 637025, upload_time = "2024-10-20T00:29:43.352Z" }, + { url = "https://files.pythonhosted.org/packages/cc/57/2dc240bb263d58786cfaa60920779af6e8d32da63ab9ffc09f8312bd7a14/asyncpg-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3", size = 3496243, upload_time = "2024-10-20T00:29:44.922Z" }, + { url = "https://files.pythonhosted.org/packages/f4/40/0ae9d061d278b10713ea9021ef6b703ec44698fe32178715a501ac696c6b/asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737", size = 3575059, upload_time = "2024-10-20T00:29:46.891Z" }, + { url = "https://files.pythonhosted.org/packages/c3/75/d6b895a35a2c6506952247640178e5f768eeb28b2e20299b6a6f1d743ba0/asyncpg-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a", size = 3473596, upload_time = "2024-10-20T00:29:49.201Z" }, + { url = "https://files.pythonhosted.org/packages/c8/e7/3693392d3e168ab0aebb2d361431375bd22ffc7b4a586a0fc060d519fae7/asyncpg-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af", size = 3641632, upload_time = "2024-10-20T00:29:50.768Z" }, + { url = "https://files.pythonhosted.org/packages/32/ea/15670cea95745bba3f0352341db55f506a820b21c619ee66b7d12ea7867d/asyncpg-0.30.0-cp312-cp312-win32.whl", hash = "sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e", size = 560186, upload_time = "2024-10-20T00:29:52.394Z" }, + { url = "https://files.pythonhosted.org/packages/7e/6b/fe1fad5cee79ca5f5c27aed7bd95baee529c1bf8a387435c8ba4fe53d5c1/asyncpg-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305", size = 621064, upload_time = "2024-10-20T00:29:53.757Z" }, + { url = 
"https://files.pythonhosted.org/packages/3a/22/e20602e1218dc07692acf70d5b902be820168d6282e69ef0d3cb920dc36f/asyncpg-0.30.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70", size = 670373, upload_time = "2024-10-20T00:29:55.165Z" }, + { url = "https://files.pythonhosted.org/packages/3d/b3/0cf269a9d647852a95c06eb00b815d0b95a4eb4b55aa2d6ba680971733b9/asyncpg-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3", size = 634745, upload_time = "2024-10-20T00:29:57.14Z" }, + { url = "https://files.pythonhosted.org/packages/8e/6d/a4f31bf358ce8491d2a31bfe0d7bcf25269e80481e49de4d8616c4295a34/asyncpg-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33", size = 3512103, upload_time = "2024-10-20T00:29:58.499Z" }, + { url = "https://files.pythonhosted.org/packages/96/19/139227a6e67f407b9c386cb594d9628c6c78c9024f26df87c912fabd4368/asyncpg-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4", size = 3592471, upload_time = "2024-10-20T00:30:00.354Z" }, + { url = "https://files.pythonhosted.org/packages/67/e4/ab3ca38f628f53f0fd28d3ff20edff1c975dd1cb22482e0061916b4b9a74/asyncpg-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4", size = 3496253, upload_time = "2024-10-20T00:30:02.794Z" }, + { url = "https://files.pythonhosted.org/packages/ef/5f/0bf65511d4eeac3a1f41c54034a492515a707c6edbc642174ae79034d3ba/asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba", size = 3662720, upload_time = "2024-10-20T00:30:04.501Z" }, + { url = "https://files.pythonhosted.org/packages/e7/31/1513d5a6412b98052c3ed9158d783b1e09d0910f51fbe0e05f56cc370bc4/asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590", size = 560404, upload_time = "2024-10-20T00:30:06.537Z" }, + { url = "https://files.pythonhosted.org/packages/c8/a4/cec76b3389c4c5ff66301cd100fe88c318563ec8a520e0b2e792b5b84972/asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e", size = 621623, upload_time = "2024-10-20T00:30:09.024Z" }, +] + +[[package]] +name = "attrs" +version = "25.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/1367933a8532ee6ff8d63537de4f1177af4bff9f3e829baf7331f595bb24/attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b", size = 812032, upload_time = "2025-03-13T11:10:22.779Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload_time = "2025-03-13T11:10:21.14Z" }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.13.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067, upload_time = "2025-04-15T17:05:13.836Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload_time = "2025-04-15T17:05:12.221Z" }, +] + +[[package]] +name = "bs4" +version = "0.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/aa/4acaf814ff901145da37332e05bb510452ebed97bc9602695059dd46ef39/bs4-0.0.2.tar.gz", hash = "sha256:a48685c58f50fe127722417bae83fe6badf500d54b55f7e39ffe43b798653925", size = 698, upload_time = "2024-01-17T18:15:47.371Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/bb/bf7aab772a159614954d84aa832c129624ba6c32faa559dfb200a534e50b/bs4-0.0.2-py2.py3-none-any.whl", hash = "sha256:abf8742c0805ef7f662dce4b51cca104cffe52b835238afc169142ab9b3fbccc", size = 1189, upload_time = "2024-01-17T18:15:48.613Z" }, +] + +[[package]] +name = "cachetools" +version = "5.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/81/3747dad6b14fa2cf53fcf10548cf5aea6913e96fab41a3c198676f8948a5/cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4", size = 28380, upload_time = "2025-02-20T21:01:19.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload_time = "2025-02-20T21:01:16.647Z" }, +] + +[[package]] +name = "certifi" +version = "2025.4.26" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/9e/c05b3920a3b7d20d3d3310465f50348e5b3694f4f88c6daf736eef3024c4/certifi-2025.4.26.tar.gz", hash = "sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6", size = 160705, upload_time = "2025-04-26T02:12:29.51Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3", size = 159618, upload_time = "2025-04-26T02:12:27.662Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794, upload_time = "2025-05-02T08:32:11.945Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846, upload_time = "2025-05-02T08:32:13.946Z" }, + { url = "https://files.pythonhosted.org/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350, upload_time = "2025-05-02T08:32:15.873Z" }, + { url = "https://files.pythonhosted.org/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657, upload_time = "2025-05-02T08:32:17.283Z" }, + { url = "https://files.pythonhosted.org/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260, upload_time = "2025-05-02T08:32:18.807Z" }, + { url = "https://files.pythonhosted.org/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164, upload_time = "2025-05-02T08:32:20.333Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571, upload_time = "2025-05-02T08:32:21.86Z" }, + { url = "https://files.pythonhosted.org/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952, upload_time = "2025-05-02T08:32:23.434Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959, upload_time = "2025-05-02T08:32:24.993Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030, upload_time = "2025-05-02T08:32:26.435Z" }, + { url = "https://files.pythonhosted.org/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015, upload_time = "2025-05-02T08:32:28.376Z" }, + { url = "https://files.pythonhosted.org/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = 
"sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106, upload_time = "2025-05-02T08:32:30.281Z" }, + { url = "https://files.pythonhosted.org/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402, upload_time = "2025-05-02T08:32:32.191Z" }, + { url = "https://files.pythonhosted.org/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936, upload_time = "2025-05-02T08:32:33.712Z" }, + { url = "https://files.pythonhosted.org/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790, upload_time = "2025-05-02T08:32:35.768Z" }, + { url = "https://files.pythonhosted.org/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924, upload_time = "2025-05-02T08:32:37.284Z" }, + { url = "https://files.pythonhosted.org/packages/86/2d/fb55fdf41964ec782febbf33cb64be480a6b8f16ded2dbe8db27a405c09f/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214", size = 146626, upload_time = "2025-05-02T08:32:38.803Z" }, + { url = "https://files.pythonhosted.org/packages/8c/73/6ede2ec59bce19b3edf4209d70004253ec5f4e319f9a2e3f2f15601ed5f7/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a", size = 148567, upload_time = "2025-05-02T08:32:40.251Z" }, + { url = "https://files.pythonhosted.org/packages/09/14/957d03c6dc343c04904530b6bef4e5efae5ec7d7990a7cbb868e4595ee30/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd", size = 150957, upload_time = "2025-05-02T08:32:41.705Z" }, + { url = "https://files.pythonhosted.org/packages/0d/c8/8174d0e5c10ccebdcb1b53cc959591c4c722a3ad92461a273e86b9f5a302/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981", size = 145408, upload_time = "2025-05-02T08:32:43.709Z" }, + { url = "https://files.pythonhosted.org/packages/58/aa/8904b84bc8084ac19dc52feb4f5952c6df03ffb460a887b42615ee1382e8/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c", size = 153399, upload_time = "2025-05-02T08:32:46.197Z" }, + { url = "https://files.pythonhosted.org/packages/c2/26/89ee1f0e264d201cb65cf054aca6038c03b1a0c6b4ae998070392a3ce605/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b", size = 156815, upload_time = "2025-05-02T08:32:48.105Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/07/68e95b4b345bad3dbbd3a8681737b4338ff2c9df29856a6d6d23ac4c73cb/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d", size = 154537, upload_time = "2025-05-02T08:32:49.719Z" }, + { url = "https://files.pythonhosted.org/packages/77/1a/5eefc0ce04affb98af07bc05f3bac9094513c0e23b0562d64af46a06aae4/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f", size = 149565, upload_time = "2025-05-02T08:32:51.404Z" }, + { url = "https://files.pythonhosted.org/packages/37/a0/2410e5e6032a174c95e0806b1a6585eb21e12f445ebe239fac441995226a/charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c", size = 98357, upload_time = "2025-05-02T08:32:53.079Z" }, + { url = "https://files.pythonhosted.org/packages/6c/4f/c02d5c493967af3eda9c771ad4d2bbc8df6f99ddbeb37ceea6e8716a32bc/charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e", size = 105776, upload_time = "2025-05-02T08:32:54.573Z" }, + { url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" }, + { url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" }, + { url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" }, + { url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" }, + { url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" }, + { url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" }, + { url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" }, + { url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" }, + { url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" }, + { url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" }, + { url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" }, + { url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" }, + { url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" }, + { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" }, +] + +[[package]] +name = "ckanapi" +version = "4.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docopt" }, + { name = "python-slugify" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "simplejson" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/31/c0131cfe3cdae242699c2889d20016fbe2444dcaf86070ee03863d1035ba/ckanapi-4.8.tar.gz", hash = "sha256:3a98d81e6cb7480883eb1d031740205d3e94176376e9d284d218829d81d0afed", size = 37633, upload_time = "2024-04-04T15:46:09.451Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/ac/626837e55aeb17f8e3982128a25fbf5f7880a397039eb7a1b5cebaca7fa4/ckanapi-4.8-py3-none-any.whl", hash = "sha256:a6ac36b55321368cf39d70f701542276fe098484517e339adf18595f30c076b8", size = 46316, upload_time = "2024-04-04T15:46:07.725Z" }, +] + +[[package]] +name = "click" +version = "8.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = 
"sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload_time = "2024-12-21T18:38:44.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload_time = "2024-12-21T18:38:41.666Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload_time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload_time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "data-source-identification" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "aiohttp" }, + { name = "alembic" }, + { name = "apscheduler" }, + { name = "asyncpg" }, + { name = "beautifulsoup4" }, + { name = "bs4" }, + { name = "ckanapi" }, + { name = "datasets" }, + { name = "docker" }, + { name = "fastapi", extra = ["standard"] }, + { name = "from-root" }, + { name = "google-api-python-client" }, + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "keras" }, + { name = "lxml" }, + { name = "marshmallow" }, + { name = "numpy" }, + { name = "openai" }, + { name = "pandas" }, + { name = "playwright" }, + { name = "psycopg", extra = ["binary"] }, + { name = "psycopg2-binary" }, + { name = "pydantic" }, + { name = "pyjwt" }, + { name = "python-dotenv" }, + { name = "requests" }, + { name = "sqlalchemy" }, + { name = "starlette" }, + { name = "tensorflow-cpu" }, + { name = "tensorflow-io-gcs-filesystem" }, + { name = "tqdm" }, + { name = "transformers" }, + { name = "urllib3" }, + { name = "uvicorn" }, +] + +[package.dev-dependencies] +dev = [ + { name = "docker" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-mock" }, + { name = "pytest-timeout" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiohttp", specifier = "~=3.11.11" }, + { name = "alembic", specifier = "~=1.14.0" }, + { name = "apscheduler", specifier = "~=3.11.0" }, + { name = "asyncpg", specifier = "~=0.30.0" }, + { name = "beautifulsoup4", specifier = ">=4.12.3" }, + { name = "bs4", specifier = "~=0.0.2" }, + { name = "ckanapi", specifier = "~=4.8" }, + { name = "datasets", specifier = "~=2.19.1" }, + { name = "docker", specifier = "~=7.1.0" }, + { name = "fastapi", extras = ["standard"], specifier = "~=0.115.6" }, + { name = "from-root", specifier = "~=1.3.0" }, + { name = "google-api-python-client", specifier = ">=2.156.0" }, + { name = "httpx", specifier = "~=0.28.1" }, + { name = "huggingface-hub", specifier = "~=0.28.1" }, + { name = "keras", specifier = "~=2.15.0" }, + { name = "lxml", specifier = "~=5.1.0" }, + { name = "marshmallow", specifier = "~=3.23.2" }, + { name = "numpy", specifier = "~=1.26.4" }, + { name = "openai", specifier = "~=1.60.1" }, + { name = "pandas", specifier = "~=2.2.3" }, + { name = "playwright", specifier = "~=1.49.1" }, + { name = "psycopg", extras = ["binary"], specifier = 
"~=3.1.20" }, + { name = "psycopg2-binary", specifier = "~=2.9.6" }, + { name = "pydantic", specifier = "~=2.10.6" }, + { name = "pyjwt", specifier = "~=2.10.1" }, + { name = "python-dotenv", specifier = "~=1.0.1" }, + { name = "requests", specifier = "~=2.32.3" }, + { name = "sqlalchemy", specifier = "~=2.0.36" }, + { name = "starlette", specifier = "~=0.45.3" }, + { name = "tensorflow-cpu", specifier = "~=2.15.1" }, + { name = "tensorflow-io-gcs-filesystem", specifier = "==0.31.0" }, + { name = "tqdm", specifier = ">=4.64.1" }, + { name = "transformers", specifier = "~=4.40.2" }, + { name = "urllib3", specifier = "~=1.26.18" }, + { name = "uvicorn", specifier = "~=0.34.0" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "docker", specifier = ">=7.1.0" }, + { name = "pytest", specifier = ">=7.2.2" }, + { name = "pytest-asyncio", specifier = "~=0.25.2" }, + { name = "pytest-mock", specifier = "==3.12.0" }, + { name = "pytest-timeout", specifier = "~=2.3.1" }, +] + +[[package]] +name = "datasets" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyarrow-hotfix" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/e7/6ee66732f74e4fb1c8915e58b3c253aded777ad0fa457f3f831dd0cd09b4/datasets-2.19.2.tar.gz", hash = "sha256:eccb82fb3bb5ee26ccc6d7a15b7f1f834e2cc4e59b7cff7733a003552bad51ef", size = 2215337, upload_time = "2024-06-03T05:11:44.756Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/59/46818ebeb708234a60e42ccf409d20709e482519d2aa450b501ddbba4594/datasets-2.19.2-py3-none-any.whl", hash = "sha256:e07ff15d75b1af75c87dd96323ba2a361128d495136652f37fd62f918d17bb4e", size = 542113, upload_time = "2024-06-03T05:11:41.151Z" }, +] + +[[package]] +name = "dill" +version = "0.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/17/4d/ac7ffa80c69ea1df30a8aa11b3578692a5118e7cd1aa157e3ef73b092d15/dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", size = 184847, upload_time = "2024-01-27T23:42:16.145Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252, upload_time = "2024-01-27T23:42:14.239Z" }, +] + +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload_time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload_time = "2023-12-24T09:54:30.421Z" }, +] + +[[package]] +name = "dnspython" +version = "2.7.0" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/4a/263763cb2ba3816dd94b08ad3a33d5fdae34ecb856678773cc40a3605829/dnspython-2.7.0.tar.gz", hash = "sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1", size = 345197, upload_time = "2024-10-05T20:14:59.362Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/1b/e0a87d256e40e8c888847551b20a017a6b98139178505dc7ffb96f04e954/dnspython-2.7.0-py3-none-any.whl", hash = "sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", size = 313632, upload_time = "2024-10-05T20:14:57.687Z" }, +] + +[[package]] +name = "docker" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", size = 117834, upload_time = "2024-05-23T11:13:57.216Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload_time = "2024-05-23T11:13:55.01Z" }, +] + +[[package]] +name = "docopt" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/55/8f8cab2afd404cf578136ef2cc5dfb50baa1761b68c9da1fb1e4eed343c9/docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491", size = 25901, upload_time = "2014-06-16T11:18:57.406Z" } + +[[package]] +name = "email-validator" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dnspython" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/48/ce/13508a1ec3f8bb981ae4ca79ea40384becc868bfae97fd1c942bb3a001b1/email_validator-2.2.0.tar.gz", hash = "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7", size = 48967, upload_time = "2024-06-20T11:30:30.034Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/ee/bf0adb559ad3c786f12bcbc9296b3f5675f529199bef03e2df281fa1fadb/email_validator-2.2.0-py3-none-any.whl", hash = "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631", size = 33521, upload_time = "2024-06-20T11:30:28.248Z" }, +] + +[[package]] +name = "fastapi" +version = "0.115.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/55/ae499352d82338331ca1e28c7f4a63bfd09479b16395dce38cf50a39e2c2/fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681", size = 295236, upload_time = "2025-03-23T22:55:43.822Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/b3/b51f09c2ba432a576fe63758bddc81f78f0c6309d9e5c10d194313bf021e/fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d", size = 95164, upload_time = "2025-03-23T22:55:42.101Z" }, +] + +[package.optional-dependencies] +standard = [ + { name = "email-validator" }, + { name = 
"fastapi-cli", extra = ["standard"] }, + { name = "httpx" }, + { name = "jinja2" }, + { name = "python-multipart" }, + { name = "uvicorn", extra = ["standard"] }, +] + +[[package]] +name = "fastapi-cli" +version = "0.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rich-toolkit" }, + { name = "typer" }, + { name = "uvicorn", extra = ["standard"] }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fe/73/82a5831fbbf8ed75905bacf5b2d9d3dfd6f04d6968b29fe6f72a5ae9ceb1/fastapi_cli-0.0.7.tar.gz", hash = "sha256:02b3b65956f526412515907a0793c9094abd4bfb5457b389f645b0ea6ba3605e", size = 16753, upload_time = "2024-12-15T14:28:10.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/e6/5daefc851b514ce2287d8f5d358ae4341089185f78f3217a69d0ce3a390c/fastapi_cli-0.0.7-py3-none-any.whl", hash = "sha256:d549368ff584b2804336c61f192d86ddea080c11255f375959627911944804f4", size = 10705, upload_time = "2024-12-15T14:28:06.18Z" }, +] + +[package.optional-dependencies] +standard = [ + { name = "uvicorn", extra = ["standard"] }, +] + +[[package]] +name = "filelock" +version = "3.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075, upload_time = "2025-03-14T07:11:40.47Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215, upload_time = "2025-03-14T07:11:39.145Z" }, +] + +[[package]] +name = "flatbuffers" +version = "25.2.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/30/eb5dce7994fc71a2f685d98ec33cc660c0a5887db5610137e60d8cbc4489/flatbuffers-25.2.10.tar.gz", hash = "sha256:97e451377a41262f8d9bd4295cc836133415cc03d8cb966410a4af92eb00d26e", size = 22170, upload_time = "2025-02-11T04:26:46.257Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/25/155f9f080d5e4bc0082edfda032ea2bc2b8fab3f4d25d46c1e9dd22a1a89/flatbuffers-25.2.10-py2.py3-none-any.whl", hash = "sha256:ebba5f4d5ea615af3f7fd70fc310636fbb2bbd1f566ac0a23d98dd412de50051", size = 30953, upload_time = "2025-02-11T04:26:44.484Z" }, +] + +[[package]] +name = "from-root" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/30/5259cfafc8372df008a5605ca19aba9d560285471ee043f39cbc5a7b7fa2/from_root-1.3.0.tar.gz", hash = "sha256:da1359f5faabca367f685cac927cb2f307bb35c488fdd0361f963d6f1cd2674f", size = 4858, upload_time = "2022-12-27T12:41:25.78Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/a8/451d0294d5d9ead3d26c25837df0588d1bcdd9235abf91e0ded629369921/from_root-1.3.0-py3-none-any.whl", hash = "sha256:7446a9b6481e668329cc11ad0a234fe4c83c63468c652e037d02846a75c726f8", size = 5489, upload_time = "2022-12-27T12:41:23.989Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/f4/d744cba2da59b5c1d88823cf9e8a6c74e4659e2b27604ed973be2a0bf5ab/frozenlist-1.6.0.tar.gz", hash = "sha256:b99655c32c1c8e06d111e7f41c06c29a5318cb1835df23a45518e02a47c63b68", size = 
42831, upload_time = "2025-04-17T22:38:53.099Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/b5/bc883b5296ec902115c00be161da93bf661199c465ec4c483feec6ea4c32/frozenlist-1.6.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae8337990e7a45683548ffb2fee1af2f1ed08169284cd829cdd9a7fa7470530d", size = 160912, upload_time = "2025-04-17T22:36:17.235Z" }, + { url = "https://files.pythonhosted.org/packages/6f/93/51b058b563d0704b39c56baa222828043aafcac17fd3734bec5dbeb619b1/frozenlist-1.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8c952f69dd524558694818a461855f35d36cc7f5c0adddce37e962c85d06eac0", size = 124315, upload_time = "2025-04-17T22:36:18.735Z" }, + { url = "https://files.pythonhosted.org/packages/c9/e0/46cd35219428d350558b874d595e132d1c17a9471a1bd0d01d518a261e7c/frozenlist-1.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f5fef13136c4e2dee91bfb9a44e236fff78fc2cd9f838eddfc470c3d7d90afe", size = 122230, upload_time = "2025-04-17T22:36:20.6Z" }, + { url = "https://files.pythonhosted.org/packages/d1/0f/7ad2ce928ad06d6dd26a61812b959ded573d3e9d0ee6109d96c2be7172e9/frozenlist-1.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:716bbba09611b4663ecbb7cd022f640759af8259e12a6ca939c0a6acd49eedba", size = 314842, upload_time = "2025-04-17T22:36:22.088Z" }, + { url = "https://files.pythonhosted.org/packages/34/76/98cbbd8a20a5c3359a2004ae5e5b216af84a150ccbad67c8f8f30fb2ea91/frozenlist-1.6.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7b8c4dc422c1a3ffc550b465090e53b0bf4839047f3e436a34172ac67c45d595", size = 304919, upload_time = "2025-04-17T22:36:24.247Z" }, + { url = "https://files.pythonhosted.org/packages/9a/fa/258e771ce3a44348c05e6b01dffc2bc67603fba95761458c238cd09a2c77/frozenlist-1.6.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b11534872256e1666116f6587a1592ef395a98b54476addb5e8d352925cb5d4a", size = 324074, upload_time = "2025-04-17T22:36:26.291Z" }, + { url = "https://files.pythonhosted.org/packages/d5/a4/047d861fd8c538210e12b208c0479912273f991356b6bdee7ea8356b07c9/frozenlist-1.6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c6eceb88aaf7221f75be6ab498dc622a151f5f88d536661af3ffc486245a626", size = 321292, upload_time = "2025-04-17T22:36:27.909Z" }, + { url = "https://files.pythonhosted.org/packages/c0/25/cfec8af758b4525676cabd36efcaf7102c1348a776c0d1ad046b8a7cdc65/frozenlist-1.6.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62c828a5b195570eb4b37369fcbbd58e96c905768d53a44d13044355647838ff", size = 301569, upload_time = "2025-04-17T22:36:29.448Z" }, + { url = "https://files.pythonhosted.org/packages/87/2f/0c819372fa9f0c07b153124bf58683b8d0ca7bb73ea5ccde9b9ef1745beb/frozenlist-1.6.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1c6bd2c6399920c9622362ce95a7d74e7f9af9bfec05fff91b8ce4b9647845a", size = 313625, upload_time = "2025-04-17T22:36:31.55Z" }, + { url = "https://files.pythonhosted.org/packages/50/5f/f0cf8b0fdedffdb76b3745aa13d5dbe404d63493cc211ce8250f2025307f/frozenlist-1.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:49ba23817781e22fcbd45fd9ff2b9b8cdb7b16a42a4851ab8025cae7b22e96d0", size = 312523, upload_time = "2025-04-17T22:36:33.078Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/6c/38c49108491272d3e84125bbabf2c2d0b304899b52f49f0539deb26ad18d/frozenlist-1.6.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:431ef6937ae0f853143e2ca67d6da76c083e8b1fe3df0e96f3802fd37626e606", size = 322657, upload_time = "2025-04-17T22:36:34.688Z" }, + { url = "https://files.pythonhosted.org/packages/bd/4b/3bd3bad5be06a9d1b04b1c22be80b5fe65b502992d62fab4bdb25d9366ee/frozenlist-1.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9d124b38b3c299ca68433597ee26b7819209cb8a3a9ea761dfe9db3a04bba584", size = 303414, upload_time = "2025-04-17T22:36:36.363Z" }, + { url = "https://files.pythonhosted.org/packages/5b/89/7e225a30bef6e85dbfe22622c24afe932e9444de3b40d58b1ea589a14ef8/frozenlist-1.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:118e97556306402e2b010da1ef21ea70cb6d6122e580da64c056b96f524fbd6a", size = 320321, upload_time = "2025-04-17T22:36:38.16Z" }, + { url = "https://files.pythonhosted.org/packages/22/72/7e3acef4dd9e86366cb8f4d8f28e852c2b7e116927e9722b31a6f71ea4b0/frozenlist-1.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:fb3b309f1d4086b5533cf7bbcf3f956f0ae6469664522f1bde4feed26fba60f1", size = 323975, upload_time = "2025-04-17T22:36:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/d8/85/e5da03d20507e13c66ce612c9792b76811b7a43e3320cce42d95b85ac755/frozenlist-1.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54dece0d21dce4fdb188a1ffc555926adf1d1c516e493c2914d7c370e454bc9e", size = 316553, upload_time = "2025-04-17T22:36:42.045Z" }, + { url = "https://files.pythonhosted.org/packages/ac/8e/6c609cbd0580ae8a0661c408149f196aade7d325b1ae7adc930501b81acb/frozenlist-1.6.0-cp311-cp311-win32.whl", hash = "sha256:654e4ba1d0b2154ca2f096bed27461cf6160bc7f504a7f9a9ef447c293caf860", size = 115511, upload_time = "2025-04-17T22:36:44.067Z" }, + { url = "https://files.pythonhosted.org/packages/f2/13/a84804cfde6de12d44ed48ecbf777ba62b12ff09e761f76cdd1ff9e14bb1/frozenlist-1.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e911391bffdb806001002c1f860787542f45916c3baf764264a52765d5a5603", size = 120863, upload_time = "2025-04-17T22:36:45.465Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8a/289b7d0de2fbac832ea80944d809759976f661557a38bb8e77db5d9f79b7/frozenlist-1.6.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:c5b9e42ace7d95bf41e19b87cec8f262c41d3510d8ad7514ab3862ea2197bfb1", size = 160193, upload_time = "2025-04-17T22:36:47.382Z" }, + { url = "https://files.pythonhosted.org/packages/19/80/2fd17d322aec7f430549f0669f599997174f93ee17929ea5b92781ec902c/frozenlist-1.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ca9973735ce9f770d24d5484dcb42f68f135351c2fc81a7a9369e48cf2998a29", size = 123831, upload_time = "2025-04-17T22:36:49.401Z" }, + { url = "https://files.pythonhosted.org/packages/99/06/f5812da431273f78c6543e0b2f7de67dfd65eb0a433978b2c9c63d2205e4/frozenlist-1.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6ac40ec76041c67b928ca8aaffba15c2b2ee3f5ae8d0cb0617b5e63ec119ca25", size = 121862, upload_time = "2025-04-17T22:36:51.899Z" }, + { url = "https://files.pythonhosted.org/packages/d0/31/9e61c6b5fc493cf24d54881731204d27105234d09878be1a5983182cc4a5/frozenlist-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95b7a8a3180dfb280eb044fdec562f9b461614c0ef21669aea6f1d3dac6ee576", size = 316361, upload_time = "2025-04-17T22:36:53.402Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/55/22ca9362d4f0222324981470fd50192be200154d51509ee6eb9baa148e96/frozenlist-1.6.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c444d824e22da6c9291886d80c7d00c444981a72686e2b59d38b285617cb52c8", size = 307115, upload_time = "2025-04-17T22:36:55.016Z" }, + { url = "https://files.pythonhosted.org/packages/ae/39/4fff42920a57794881e7bb3898dc7f5f539261711ea411b43bba3cde8b79/frozenlist-1.6.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb52c8166499a8150bfd38478248572c924c003cbb45fe3bcd348e5ac7c000f9", size = 322505, upload_time = "2025-04-17T22:36:57.12Z" }, + { url = "https://files.pythonhosted.org/packages/55/f2/88c41f374c1e4cf0092a5459e5f3d6a1e17ed274c98087a76487783df90c/frozenlist-1.6.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b35298b2db9c2468106278537ee529719228950a5fdda686582f68f247d1dc6e", size = 322666, upload_time = "2025-04-17T22:36:58.735Z" }, + { url = "https://files.pythonhosted.org/packages/75/51/034eeb75afdf3fd03997856195b500722c0b1a50716664cde64e28299c4b/frozenlist-1.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d108e2d070034f9d57210f22fefd22ea0d04609fc97c5f7f5a686b3471028590", size = 302119, upload_time = "2025-04-17T22:37:00.512Z" }, + { url = "https://files.pythonhosted.org/packages/2b/a6/564ecde55ee633270a793999ef4fd1d2c2b32b5a7eec903b1012cb7c5143/frozenlist-1.6.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e1be9111cb6756868ac242b3c2bd1f09d9aea09846e4f5c23715e7afb647103", size = 316226, upload_time = "2025-04-17T22:37:02.102Z" }, + { url = "https://files.pythonhosted.org/packages/f1/c8/6c0682c32377f402b8a6174fb16378b683cf6379ab4d2827c580892ab3c7/frozenlist-1.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:94bb451c664415f02f07eef4ece976a2c65dcbab9c2f1705b7031a3a75349d8c", size = 312788, upload_time = "2025-04-17T22:37:03.578Z" }, + { url = "https://files.pythonhosted.org/packages/b6/b8/10fbec38f82c5d163ca1750bfff4ede69713badf236a016781cf1f10a0f0/frozenlist-1.6.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:d1a686d0b0949182b8faddea596f3fc11f44768d1f74d4cad70213b2e139d821", size = 325914, upload_time = "2025-04-17T22:37:05.213Z" }, + { url = "https://files.pythonhosted.org/packages/62/ca/2bf4f3a1bd40cdedd301e6ecfdbb291080d5afc5f9ce350c0739f773d6b9/frozenlist-1.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ea8e59105d802c5a38bdbe7362822c522230b3faba2aa35c0fa1765239b7dd70", size = 305283, upload_time = "2025-04-17T22:37:06.985Z" }, + { url = "https://files.pythonhosted.org/packages/09/64/20cc13ccf94abc2a1f482f74ad210703dc78a590d0b805af1c9aa67f76f9/frozenlist-1.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:abc4e880a9b920bc5020bf6a431a6bb40589d9bca3975c980495f63632e8382f", size = 319264, upload_time = "2025-04-17T22:37:08.618Z" }, + { url = "https://files.pythonhosted.org/packages/20/ff/86c6a2bbe98cfc231519f5e6d712a0898488ceac804a917ce014f32e68f6/frozenlist-1.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9a79713adfe28830f27a3c62f6b5406c37376c892b05ae070906f07ae4487046", size = 326482, upload_time = "2025-04-17T22:37:10.196Z" }, + { url = "https://files.pythonhosted.org/packages/2f/da/8e381f66367d79adca245d1d71527aac774e30e291d41ef161ce2d80c38e/frozenlist-1.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:9a0318c2068e217a8f5e3b85e35899f5a19e97141a45bb925bb357cfe1daf770", size = 318248, upload_time = "2025-04-17T22:37:12.284Z" }, + { url = "https://files.pythonhosted.org/packages/39/24/1a1976563fb476ab6f0fa9fefaac7616a4361dbe0461324f9fd7bf425dbe/frozenlist-1.6.0-cp312-cp312-win32.whl", hash = "sha256:853ac025092a24bb3bf09ae87f9127de9fe6e0c345614ac92536577cf956dfcc", size = 115161, upload_time = "2025-04-17T22:37:13.902Z" }, + { url = "https://files.pythonhosted.org/packages/80/2e/fb4ed62a65f8cd66044706b1013f0010930d8cbb0729a2219561ea075434/frozenlist-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:2bdfe2d7e6c9281c6e55523acd6c2bf77963cb422fdc7d142fb0cb6621b66878", size = 120548, upload_time = "2025-04-17T22:37:15.326Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e5/04c7090c514d96ca00887932417f04343ab94904a56ab7f57861bf63652d/frozenlist-1.6.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1d7fb014fe0fbfee3efd6a94fc635aeaa68e5e1720fe9e57357f2e2c6e1a647e", size = 158182, upload_time = "2025-04-17T22:37:16.837Z" }, + { url = "https://files.pythonhosted.org/packages/e9/8f/60d0555c61eec855783a6356268314d204137f5e0c53b59ae2fc28938c99/frozenlist-1.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01bcaa305a0fdad12745502bfd16a1c75b14558dabae226852f9159364573117", size = 122838, upload_time = "2025-04-17T22:37:18.352Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a7/d0ec890e3665b4b3b7c05dc80e477ed8dc2e2e77719368e78e2cd9fec9c8/frozenlist-1.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8b314faa3051a6d45da196a2c495e922f987dc848e967d8cfeaee8a0328b1cd4", size = 120980, upload_time = "2025-04-17T22:37:19.857Z" }, + { url = "https://files.pythonhosted.org/packages/cc/19/9b355a5e7a8eba903a008579964192c3e427444752f20b2144b10bb336df/frozenlist-1.6.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da62fecac21a3ee10463d153549d8db87549a5e77eefb8c91ac84bb42bb1e4e3", size = 305463, upload_time = "2025-04-17T22:37:21.328Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8d/5b4c758c2550131d66935ef2fa700ada2461c08866aef4229ae1554b93ca/frozenlist-1.6.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d1eb89bf3454e2132e046f9599fbcf0a4483ed43b40f545551a39316d0201cd1", size = 297985, upload_time = "2025-04-17T22:37:23.55Z" }, + { url = "https://files.pythonhosted.org/packages/48/2c/537ec09e032b5865715726b2d1d9813e6589b571d34d01550c7aeaad7e53/frozenlist-1.6.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18689b40cb3936acd971f663ccb8e2589c45db5e2c5f07e0ec6207664029a9c", size = 311188, upload_time = "2025-04-17T22:37:25.221Z" }, + { url = "https://files.pythonhosted.org/packages/31/2f/1aa74b33f74d54817055de9a4961eff798f066cdc6f67591905d4fc82a84/frozenlist-1.6.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e67ddb0749ed066b1a03fba812e2dcae791dd50e5da03be50b6a14d0c1a9ee45", size = 311874, upload_time = "2025-04-17T22:37:26.791Z" }, + { url = "https://files.pythonhosted.org/packages/bf/f0/cfec18838f13ebf4b37cfebc8649db5ea71a1b25dacd691444a10729776c/frozenlist-1.6.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc5e64626e6682638d6e44398c9baf1d6ce6bc236d40b4b57255c9d3f9761f1f", size = 291897, upload_time = "2025-04-17T22:37:28.958Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/a5/deb39325cbbea6cd0a46db8ccd76150ae2fcbe60d63243d9df4a0b8c3205/frozenlist-1.6.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:437cfd39564744ae32ad5929e55b18ebd88817f9180e4cc05e7d53b75f79ce85", size = 305799, upload_time = "2025-04-17T22:37:30.889Z" }, + { url = "https://files.pythonhosted.org/packages/78/22/6ddec55c5243a59f605e4280f10cee8c95a449f81e40117163383829c241/frozenlist-1.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:62dd7df78e74d924952e2feb7357d826af8d2f307557a779d14ddf94d7311be8", size = 302804, upload_time = "2025-04-17T22:37:32.489Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b7/d9ca9bab87f28855063c4d202936800219e39db9e46f9fb004d521152623/frozenlist-1.6.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:a66781d7e4cddcbbcfd64de3d41a61d6bdde370fc2e38623f30b2bd539e84a9f", size = 316404, upload_time = "2025-04-17T22:37:34.59Z" }, + { url = "https://files.pythonhosted.org/packages/a6/3a/1255305db7874d0b9eddb4fe4a27469e1fb63720f1fc6d325a5118492d18/frozenlist-1.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:482fe06e9a3fffbcd41950f9d890034b4a54395c60b5e61fae875d37a699813f", size = 295572, upload_time = "2025-04-17T22:37:36.337Z" }, + { url = "https://files.pythonhosted.org/packages/2a/f2/8d38eeee39a0e3a91b75867cc102159ecccf441deb6ddf67be96d3410b84/frozenlist-1.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e4f9373c500dfc02feea39f7a56e4f543e670212102cc2eeb51d3a99c7ffbde6", size = 307601, upload_time = "2025-04-17T22:37:37.923Z" }, + { url = "https://files.pythonhosted.org/packages/38/04/80ec8e6b92f61ef085422d7b196822820404f940950dde5b2e367bede8bc/frozenlist-1.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e69bb81de06827147b7bfbaeb284d85219fa92d9f097e32cc73675f279d70188", size = 314232, upload_time = "2025-04-17T22:37:39.669Z" }, + { url = "https://files.pythonhosted.org/packages/3a/58/93b41fb23e75f38f453ae92a2f987274c64637c450285577bd81c599b715/frozenlist-1.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7613d9977d2ab4a9141dde4a149f4357e4065949674c5649f920fec86ecb393e", size = 308187, upload_time = "2025-04-17T22:37:41.662Z" }, + { url = "https://files.pythonhosted.org/packages/6a/a2/e64df5c5aa36ab3dee5a40d254f3e471bb0603c225f81664267281c46a2d/frozenlist-1.6.0-cp313-cp313-win32.whl", hash = "sha256:4def87ef6d90429f777c9d9de3961679abf938cb6b7b63d4a7eb8a268babfce4", size = 114772, upload_time = "2025-04-17T22:37:43.132Z" }, + { url = "https://files.pythonhosted.org/packages/a0/77/fead27441e749b2d574bb73d693530d59d520d4b9e9679b8e3cb779d37f2/frozenlist-1.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:37a8a52c3dfff01515e9bbbee0e6063181362f9de3db2ccf9bc96189b557cbfd", size = 119847, upload_time = "2025-04-17T22:37:45.118Z" }, + { url = "https://files.pythonhosted.org/packages/df/bd/cc6d934991c1e5d9cafda83dfdc52f987c7b28343686aef2e58a9cf89f20/frozenlist-1.6.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:46138f5a0773d064ff663d273b309b696293d7a7c00a0994c5c13a5078134b64", size = 174937, upload_time = "2025-04-17T22:37:46.635Z" }, + { url = "https://files.pythonhosted.org/packages/f2/a2/daf945f335abdbfdd5993e9dc348ef4507436936ab3c26d7cfe72f4843bf/frozenlist-1.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:f88bc0a2b9c2a835cb888b32246c27cdab5740059fb3688852bf91e915399b91", size = 136029, upload_time = "2025-04-17T22:37:48.192Z" }, + { url = 
"https://files.pythonhosted.org/packages/51/65/4c3145f237a31247c3429e1c94c384d053f69b52110a0d04bfc8afc55fb2/frozenlist-1.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:777704c1d7655b802c7850255639672e90e81ad6fa42b99ce5ed3fbf45e338dd", size = 134831, upload_time = "2025-04-17T22:37:50.485Z" }, + { url = "https://files.pythonhosted.org/packages/77/38/03d316507d8dea84dfb99bdd515ea245628af964b2bf57759e3c9205cc5e/frozenlist-1.6.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85ef8d41764c7de0dcdaf64f733a27352248493a85a80661f3c678acd27e31f2", size = 392981, upload_time = "2025-04-17T22:37:52.558Z" }, + { url = "https://files.pythonhosted.org/packages/37/02/46285ef9828f318ba400a51d5bb616ded38db8466836a9cfa39f3903260b/frozenlist-1.6.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:da5cb36623f2b846fb25009d9d9215322318ff1c63403075f812b3b2876c8506", size = 371999, upload_time = "2025-04-17T22:37:54.092Z" }, + { url = "https://files.pythonhosted.org/packages/0d/64/1212fea37a112c3c5c05bfb5f0a81af4836ce349e69be75af93f99644da9/frozenlist-1.6.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cbb56587a16cf0fb8acd19e90ff9924979ac1431baea8681712716a8337577b0", size = 392200, upload_time = "2025-04-17T22:37:55.951Z" }, + { url = "https://files.pythonhosted.org/packages/81/ce/9a6ea1763e3366e44a5208f76bf37c76c5da570772375e4d0be85180e588/frozenlist-1.6.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6154c3ba59cda3f954c6333025369e42c3acd0c6e8b6ce31eb5c5b8116c07e0", size = 390134, upload_time = "2025-04-17T22:37:57.633Z" }, + { url = "https://files.pythonhosted.org/packages/bc/36/939738b0b495b2c6d0c39ba51563e453232813042a8d908b8f9544296c29/frozenlist-1.6.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e8246877afa3f1ae5c979fe85f567d220f86a50dc6c493b9b7d8191181ae01e", size = 365208, upload_time = "2025-04-17T22:37:59.742Z" }, + { url = "https://files.pythonhosted.org/packages/b4/8b/939e62e93c63409949c25220d1ba8e88e3960f8ef6a8d9ede8f94b459d27/frozenlist-1.6.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b0f6cce16306d2e117cf9db71ab3a9e8878a28176aeaf0dbe35248d97b28d0c", size = 385548, upload_time = "2025-04-17T22:38:01.416Z" }, + { url = "https://files.pythonhosted.org/packages/62/38/22d2873c90102e06a7c5a3a5b82ca47e393c6079413e8a75c72bff067fa8/frozenlist-1.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1b8e8cd8032ba266f91136d7105706ad57770f3522eac4a111d77ac126a25a9b", size = 391123, upload_time = "2025-04-17T22:38:03.049Z" }, + { url = "https://files.pythonhosted.org/packages/44/78/63aaaf533ee0701549500f6d819be092c6065cb5c577edb70c09df74d5d0/frozenlist-1.6.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:e2ada1d8515d3ea5378c018a5f6d14b4994d4036591a52ceaf1a1549dec8e1ad", size = 394199, upload_time = "2025-04-17T22:38:04.776Z" }, + { url = "https://files.pythonhosted.org/packages/54/45/71a6b48981d429e8fbcc08454dc99c4c2639865a646d549812883e9c9dd3/frozenlist-1.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:cdb2c7f071e4026c19a3e32b93a09e59b12000751fc9b0b7758da899e657d215", size = 373854, upload_time = "2025-04-17T22:38:06.576Z" }, + { url = "https://files.pythonhosted.org/packages/3f/f3/dbf2a5e11736ea81a66e37288bf9f881143a7822b288a992579ba1b4204d/frozenlist-1.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = 
"sha256:03572933a1969a6d6ab509d509e5af82ef80d4a5d4e1e9f2e1cdd22c77a3f4d2", size = 395412, upload_time = "2025-04-17T22:38:08.197Z" }, + { url = "https://files.pythonhosted.org/packages/b3/f1/c63166806b331f05104d8ea385c4acd511598568b1f3e4e8297ca54f2676/frozenlist-1.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:77effc978947548b676c54bbd6a08992759ea6f410d4987d69feea9cd0919911", size = 394936, upload_time = "2025-04-17T22:38:10.056Z" }, + { url = "https://files.pythonhosted.org/packages/ef/ea/4f3e69e179a430473eaa1a75ff986526571215fefc6b9281cdc1f09a4eb8/frozenlist-1.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a2bda8be77660ad4089caf2223fdbd6db1858462c4b85b67fbfa22102021e497", size = 391459, upload_time = "2025-04-17T22:38:11.826Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c3/0fc2c97dea550df9afd072a37c1e95421652e3206bbeaa02378b24c2b480/frozenlist-1.6.0-cp313-cp313t-win32.whl", hash = "sha256:a4d96dc5bcdbd834ec6b0f91027817214216b5b30316494d2b1aebffb87c534f", size = 128797, upload_time = "2025-04-17T22:38:14.013Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f5/79c9320c5656b1965634fe4be9c82b12a3305bdbc58ad9cb941131107b20/frozenlist-1.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e18036cb4caa17ea151fd5f3d70be9d354c99eb8cf817a3ccde8a7873b074348", size = 134709, upload_time = "2025-04-17T22:38:15.551Z" }, + { url = "https://files.pythonhosted.org/packages/71/3e/b04a0adda73bd52b390d730071c0d577073d3d26740ee1bad25c3ad0f37b/frozenlist-1.6.0-py3-none-any.whl", hash = "sha256:535eec9987adb04701266b92745d6cdcef2e77669299359c3009c3404dd5d191", size = 12404, upload_time = "2025-04-17T22:38:51.668Z" }, +] + +[[package]] +name = "fsspec" +version = "2024.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/b8/e3ba21f03c00c27adc9a8cd1cab8adfb37b6024757133924a9a4eab63a83/fsspec-2024.3.1.tar.gz", hash = "sha256:f39780e282d7d117ffb42bb96992f8a90795e4d0fb0f661a70ca39fe9c43ded9", size = 170742, upload_time = "2024-03-18T19:35:13.995Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/6d/66d48b03460768f523da62a57a7e14e5e95fdf339d79e996ce3cecda2cdb/fsspec-2024.3.1-py3-none-any.whl", hash = "sha256:918d18d41bf73f0e2b261824baeb1b124bcf771767e3a26425cd7dec3332f512", size = 171991, upload_time = "2024-03-18T19:35:11.259Z" }, +] + +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + +[[package]] +name = "gast" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3c/14/c566f5ca00c115db7725263408ff952b8ae6d6a4e792ef9c84e77d9af7a1/gast-0.6.0.tar.gz", hash = "sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb", size = 27708, upload_time = "2024-06-27T20:31:49.527Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/61/8001b38461d751cd1a0c3a6ae84346796a5758123f3ed97a1b121dfbf4f3/gast-0.6.0-py3-none-any.whl", hash = "sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54", size = 21173, upload_time = "2024-07-09T13:15:15.615Z" }, +] + +[[package]] +name = "google-api-core" +version = "2.24.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/09/5c/085bcb872556934bb119e5e09de54daa07873f6866b8f0303c49e72287f7/google_api_core-2.24.2.tar.gz", hash = "sha256:81718493daf06d96d6bc76a91c23874dbf2fac0adbbf542831b805ee6e974696", size = 163516, upload_time = "2025-03-10T15:55:26.201Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/95/f472d85adab6e538da2025dfca9e976a0d125cc0af2301f190e77b76e51c/google_api_core-2.24.2-py3-none-any.whl", hash = "sha256:810a63ac95f3c441b7c0e43d344e372887f62ce9071ba972eacf32672e072de9", size = 160061, upload_time = "2025-03-10T15:55:24.386Z" }, +] + +[[package]] +name = "google-api-python-client" +version = "2.169.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "google-auth" }, + { name = "google-auth-httplib2" }, + { name = "httplib2" }, + { name = "uritemplate" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4f/e6/787c24738fc7c99de9289abe60bd64591800ae1cdf60db7b87e0e6ef9cdd/google_api_python_client-2.169.0.tar.gz", hash = "sha256:0585bb97bd5f5bf3ed8d4bf624593e4c5a14d06c811d1952b07a1f94b4d12c51", size = 12811341, upload_time = "2025-04-29T15:46:05.603Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/bd/6aa93c38756cc9fc63262e0dc3d3f1ff7241ce6f413a25ad6e4a9c98b473/google_api_python_client-2.169.0-py3-none-any.whl", hash = "sha256:dae3e882dc0e6f28e60cf09c1f13fedfd881db84f824dd418aa9e44def2fe00d", size = 13323742, upload_time = "2025-04-29T15:46:02.521Z" }, +] + +[[package]] +name = "google-auth" +version = "2.40.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/a5/38c21d0e731bb716cffcf987bd9a3555cb95877ab4b616cfb96939933f20/google_auth-2.40.1.tar.gz", hash = "sha256:58f0e8416a9814c1d86c9b7f6acf6816b51aba167b2c76821965271bac275540", size = 280975, upload_time = "2025-05-07T01:04:55.3Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/b1/1272c6e80847ba5349f5ccb7574596393d1e222543f5003cb810865c3575/google_auth-2.40.1-py2.py3-none-any.whl", hash = "sha256:ed4cae4f5c46b41bae1d19c036e06f6c371926e97b19e816fc854eff811974ee", size = 216101, upload_time = "2025-05-07T01:04:53.612Z" }, +] + +[[package]] +name = "google-auth-httplib2" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "httplib2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/56/be/217a598a818567b28e859ff087f347475c807a5649296fb5a817c58dacef/google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05", size = 10842, upload_time = "2023-12-12T17:40:30.722Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/8a/fe34d2f3f9470a27b01c9e76226965863f153d5fbe276f83608562e49c04/google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d", size = 9253, upload_time = "2023-12-12T17:40:13.055Z" }, +] + +[[package]] +name = "google-auth-oauthlib" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "requests-oauthlib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/87/e10bf24f7bcffc1421b84d6f9c3377c30ec305d082cd737ddaa6d8f77f7c/google_auth_oauthlib-1.2.2.tar.gz", hash = 
"sha256:11046fb8d3348b296302dd939ace8af0a724042e8029c1b872d87fabc9f41684", size = 20955, upload_time = "2025-04-22T16:40:29.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl", hash = "sha256:fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2", size = 19072, upload_time = "2025-04-22T16:40:28.174Z" }, +] + +[[package]] +name = "google-pasta" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/4a/0bd53b36ff0323d10d5f24ebd67af2de10a1117f5cf4d7add90df92756f1/google-pasta-0.2.0.tar.gz", hash = "sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e", size = 40430, upload_time = "2020-03-13T18:57:50.34Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/de/c648ef6835192e6e2cc03f40b19eeda4382c49b5bafb43d88b931c4c74ac/google_pasta-0.2.0-py3-none-any.whl", hash = "sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed", size = 57471, upload_time = "2020-03-13T18:57:48.872Z" }, +] + +[[package]] +name = "googleapis-common-protos" +version = "1.70.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257", size = 145903, upload_time = "2025-04-14T10:17:02.924Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload_time = "2025-04-14T10:17:01.271Z" }, +] + +[[package]] +name = "greenlet" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/ff/df5fede753cc10f6a5be0931204ea30c35fa2f2ea7a35b25bdaf4fe40e46/greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467", size = 186022, upload_time = "2024-09-20T18:21:04.506Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/28/62/1c2665558618553c42922ed47a4e6d6527e2fa3516a8256c2f431c5d0441/greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70", size = 272479, upload_time = "2024-09-20T17:07:22.332Z" }, + { url = "https://files.pythonhosted.org/packages/76/9d/421e2d5f07285b6e4e3a676b016ca781f63cfe4a0cd8eaecf3fd6f7a71ae/greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159", size = 640404, upload_time = "2024-09-20T17:36:45.588Z" }, + { url = "https://files.pythonhosted.org/packages/e5/de/6e05f5c59262a584e502dd3d261bbdd2c97ab5416cc9c0b91ea38932a901/greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e", size = 652813, upload_time = "2024-09-20T17:39:19.052Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/93/d5f93c84241acdea15a8fd329362c2c71c79e1a507c3f142a5d67ea435ae/greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1", size = 648517, upload_time = "2024-09-20T17:44:24.101Z" }, + { url = "https://files.pythonhosted.org/packages/15/85/72f77fc02d00470c86a5c982b8daafdf65d38aefbbe441cebff3bf7037fc/greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383", size = 647831, upload_time = "2024-09-20T17:08:40.577Z" }, + { url = "https://files.pythonhosted.org/packages/f7/4b/1c9695aa24f808e156c8f4813f685d975ca73c000c2a5056c514c64980f6/greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a", size = 602413, upload_time = "2024-09-20T17:08:31.728Z" }, + { url = "https://files.pythonhosted.org/packages/76/70/ad6e5b31ef330f03b12559d19fda2606a522d3849cde46b24f223d6d1619/greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511", size = 1129619, upload_time = "2024-09-20T17:44:14.222Z" }, + { url = "https://files.pythonhosted.org/packages/f4/fb/201e1b932e584066e0f0658b538e73c459b34d44b4bd4034f682423bc801/greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395", size = 1155198, upload_time = "2024-09-20T17:09:23.903Z" }, + { url = "https://files.pythonhosted.org/packages/12/da/b9ed5e310bb8b89661b80cbcd4db5a067903bbcd7fc854923f5ebb4144f0/greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39", size = 298930, upload_time = "2024-09-20T17:25:18.656Z" }, + { url = "https://files.pythonhosted.org/packages/7d/ec/bad1ac26764d26aa1353216fcbfa4670050f66d445448aafa227f8b16e80/greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d", size = 274260, upload_time = "2024-09-20T17:08:07.301Z" }, + { url = "https://files.pythonhosted.org/packages/66/d4/c8c04958870f482459ab5956c2942c4ec35cac7fe245527f1039837c17a9/greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79", size = 649064, upload_time = "2024-09-20T17:36:47.628Z" }, + { url = "https://files.pythonhosted.org/packages/51/41/467b12a8c7c1303d20abcca145db2be4e6cd50a951fa30af48b6ec607581/greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa", size = 663420, upload_time = "2024-09-20T17:39:21.258Z" }, + { url = "https://files.pythonhosted.org/packages/27/8f/2a93cd9b1e7107d5c7b3b7816eeadcac2ebcaf6d6513df9abaf0334777f6/greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441", size = 658035, upload_time = "2024-09-20T17:44:26.501Z" }, + { url = "https://files.pythonhosted.org/packages/57/5c/7c6f50cb12be092e1dccb2599be5a942c3416dbcfb76efcf54b3f8be4d8d/greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36", size = 660105, upload_time = 
"2024-09-20T17:08:42.048Z" }, + { url = "https://files.pythonhosted.org/packages/f1/66/033e58a50fd9ec9df00a8671c74f1f3a320564c6415a4ed82a1c651654ba/greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9", size = 613077, upload_time = "2024-09-20T17:08:33.707Z" }, + { url = "https://files.pythonhosted.org/packages/19/c5/36384a06f748044d06bdd8776e231fadf92fc896bd12cb1c9f5a1bda9578/greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0", size = 1135975, upload_time = "2024-09-20T17:44:15.989Z" }, + { url = "https://files.pythonhosted.org/packages/38/f9/c0a0eb61bdf808d23266ecf1d63309f0e1471f284300ce6dac0ae1231881/greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942", size = 1163955, upload_time = "2024-09-20T17:09:25.539Z" }, + { url = "https://files.pythonhosted.org/packages/43/21/a5d9df1d21514883333fc86584c07c2b49ba7c602e670b174bd73cfc9c7f/greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01", size = 299655, upload_time = "2024-09-20T17:21:22.427Z" }, + { url = "https://files.pythonhosted.org/packages/f3/57/0db4940cd7bb461365ca8d6fd53e68254c9dbbcc2b452e69d0d41f10a85e/greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1", size = 272990, upload_time = "2024-09-20T17:08:26.312Z" }, + { url = "https://files.pythonhosted.org/packages/1c/ec/423d113c9f74e5e402e175b157203e9102feeb7088cee844d735b28ef963/greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff", size = 649175, upload_time = "2024-09-20T17:36:48.983Z" }, + { url = "https://files.pythonhosted.org/packages/a9/46/ddbd2db9ff209186b7b7c621d1432e2f21714adc988703dbdd0e65155c77/greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a", size = 663425, upload_time = "2024-09-20T17:39:22.705Z" }, + { url = "https://files.pythonhosted.org/packages/bc/f9/9c82d6b2b04aa37e38e74f0c429aece5eeb02bab6e3b98e7db89b23d94c6/greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e", size = 657736, upload_time = "2024-09-20T17:44:28.544Z" }, + { url = "https://files.pythonhosted.org/packages/d9/42/b87bc2a81e3a62c3de2b0d550bf91a86939442b7ff85abb94eec3fc0e6aa/greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4", size = 660347, upload_time = "2024-09-20T17:08:45.56Z" }, + { url = "https://files.pythonhosted.org/packages/37/fa/71599c3fd06336cdc3eac52e6871cfebab4d9d70674a9a9e7a482c318e99/greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e", size = 615583, upload_time = "2024-09-20T17:08:36.85Z" }, + { url = "https://files.pythonhosted.org/packages/4e/96/e9ef85de031703ee7a4483489b40cf307f93c1824a02e903106f2ea315fe/greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1", size = 
1133039, upload_time = "2024-09-20T17:44:18.287Z" }, + { url = "https://files.pythonhosted.org/packages/87/76/b2b6362accd69f2d1889db61a18c94bc743e961e3cab344c2effaa4b4a25/greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c", size = 1160716, upload_time = "2024-09-20T17:09:27.112Z" }, + { url = "https://files.pythonhosted.org/packages/1f/1b/54336d876186920e185066d8c3024ad55f21d7cc3683c856127ddb7b13ce/greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761", size = 299490, upload_time = "2024-09-20T17:17:09.501Z" }, + { url = "https://files.pythonhosted.org/packages/5f/17/bea55bf36990e1638a2af5ba10c1640273ef20f627962cf97107f1e5d637/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011", size = 643731, upload_time = "2024-09-20T17:36:50.376Z" }, + { url = "https://files.pythonhosted.org/packages/78/d2/aa3d2157f9ab742a08e0fd8f77d4699f37c22adfbfeb0c610a186b5f75e0/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13", size = 649304, upload_time = "2024-09-20T17:39:24.55Z" }, + { url = "https://files.pythonhosted.org/packages/f1/8e/d0aeffe69e53ccff5a28fa86f07ad1d2d2d6537a9506229431a2a02e2f15/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475", size = 646537, upload_time = "2024-09-20T17:44:31.102Z" }, + { url = "https://files.pythonhosted.org/packages/05/79/e15408220bbb989469c8871062c97c6c9136770657ba779711b90870d867/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b", size = 642506, upload_time = "2024-09-20T17:08:47.852Z" }, + { url = "https://files.pythonhosted.org/packages/18/87/470e01a940307796f1d25f8167b551a968540fbe0551c0ebb853cb527dd6/greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822", size = 602753, upload_time = "2024-09-20T17:08:38.079Z" }, + { url = "https://files.pythonhosted.org/packages/e2/72/576815ba674eddc3c25028238f74d7b8068902b3968cbe456771b166455e/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01", size = 1122731, upload_time = "2024-09-20T17:44:20.556Z" }, + { url = "https://files.pythonhosted.org/packages/ac/38/08cc303ddddc4b3d7c628c3039a61a3aae36c241ed01393d00c2fd663473/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6", size = 1142112, upload_time = "2024-09-20T17:09:28.753Z" }, +] + +[[package]] +name = "grpcio" +version = "1.71.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/95/aa11fc09a85d91fbc7dd405dcb2a1e0256989d67bf89fa65ae24b3ba105a/grpcio-1.71.0.tar.gz", hash = "sha256:2b85f7820475ad3edec209d3d89a7909ada16caab05d3f2e08a7e8ae3200a55c", size = 12549828, upload_time = "2025-03-10T19:28:49.203Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/04/a085f3ad4133426f6da8c1becf0749872a49feb625a407a2e864ded3fb12/grpcio-1.71.0-cp311-cp311-linux_armv7l.whl", 
hash = "sha256:d6aa986318c36508dc1d5001a3ff169a15b99b9f96ef5e98e13522c506b37eef", size = 5210453, upload_time = "2025-03-10T19:24:33.342Z" }, + { url = "https://files.pythonhosted.org/packages/b4/d5/0bc53ed33ba458de95020970e2c22aa8027b26cc84f98bea7fcad5d695d1/grpcio-1.71.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:d2c170247315f2d7e5798a22358e982ad6eeb68fa20cf7a820bb74c11f0736e7", size = 11347567, upload_time = "2025-03-10T19:24:35.215Z" }, + { url = "https://files.pythonhosted.org/packages/e3/6d/ce334f7e7a58572335ccd61154d808fe681a4c5e951f8a1ff68f5a6e47ce/grpcio-1.71.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:e6f83a583ed0a5b08c5bc7a3fe860bb3c2eac1f03f1f63e0bc2091325605d2b7", size = 5696067, upload_time = "2025-03-10T19:24:37.988Z" }, + { url = "https://files.pythonhosted.org/packages/05/4a/80befd0b8b1dc2b9ac5337e57473354d81be938f87132e147c4a24a581bd/grpcio-1.71.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4be74ddeeb92cc87190e0e376dbc8fc7736dbb6d3d454f2fa1f5be1dee26b9d7", size = 6348377, upload_time = "2025-03-10T19:24:40.361Z" }, + { url = "https://files.pythonhosted.org/packages/c7/67/cbd63c485051eb78663355d9efd1b896cfb50d4a220581ec2cb9a15cd750/grpcio-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dd0dfbe4d5eb1fcfec9490ca13f82b089a309dc3678e2edabc144051270a66e", size = 5940407, upload_time = "2025-03-10T19:24:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/98/4b/7a11aa4326d7faa499f764eaf8a9b5a0eb054ce0988ee7ca34897c2b02ae/grpcio-1.71.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a2242d6950dc892afdf9e951ed7ff89473aaf744b7d5727ad56bdaace363722b", size = 6030915, upload_time = "2025-03-10T19:24:44.463Z" }, + { url = "https://files.pythonhosted.org/packages/eb/a2/cdae2d0e458b475213a011078b0090f7a1d87f9a68c678b76f6af7c6ac8c/grpcio-1.71.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0fa05ee31a20456b13ae49ad2e5d585265f71dd19fbd9ef983c28f926d45d0a7", size = 6648324, upload_time = "2025-03-10T19:24:46.287Z" }, + { url = "https://files.pythonhosted.org/packages/27/df/f345c8daaa8d8574ce9869f9b36ca220c8845923eb3087e8f317eabfc2a8/grpcio-1.71.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3d081e859fb1ebe176de33fc3adb26c7d46b8812f906042705346b314bde32c3", size = 6197839, upload_time = "2025-03-10T19:24:48.565Z" }, + { url = "https://files.pythonhosted.org/packages/f2/2c/cd488dc52a1d0ae1bad88b0d203bc302efbb88b82691039a6d85241c5781/grpcio-1.71.0-cp311-cp311-win32.whl", hash = "sha256:d6de81c9c00c8a23047136b11794b3584cdc1460ed7cbc10eada50614baa1444", size = 3619978, upload_time = "2025-03-10T19:24:50.518Z" }, + { url = "https://files.pythonhosted.org/packages/ee/3f/cf92e7e62ccb8dbdf977499547dfc27133124d6467d3a7d23775bcecb0f9/grpcio-1.71.0-cp311-cp311-win_amd64.whl", hash = "sha256:24e867651fc67717b6f896d5f0cac0ec863a8b5fb7d6441c2ab428f52c651c6b", size = 4282279, upload_time = "2025-03-10T19:24:52.313Z" }, + { url = "https://files.pythonhosted.org/packages/4c/83/bd4b6a9ba07825bd19c711d8b25874cd5de72c2a3fbf635c3c344ae65bd2/grpcio-1.71.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:0ff35c8d807c1c7531d3002be03221ff9ae15712b53ab46e2a0b4bb271f38537", size = 5184101, upload_time = "2025-03-10T19:24:54.11Z" }, + { url = "https://files.pythonhosted.org/packages/31/ea/2e0d90c0853568bf714693447f5c73272ea95ee8dad107807fde740e595d/grpcio-1.71.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:b78a99cd1ece4be92ab7c07765a0b038194ded2e0a26fd654591ee136088d8d7", size = 11310927, 
upload_time = "2025-03-10T19:24:56.1Z" }, + { url = "https://files.pythonhosted.org/packages/ac/bc/07a3fd8af80467390af491d7dc66882db43884128cdb3cc8524915e0023c/grpcio-1.71.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc1a1231ed23caac1de9f943d031f1bc38d0f69d2a3b243ea0d664fc1fbd7fec", size = 5654280, upload_time = "2025-03-10T19:24:58.55Z" }, + { url = "https://files.pythonhosted.org/packages/16/af/21f22ea3eed3d0538b6ef7889fce1878a8ba4164497f9e07385733391e2b/grpcio-1.71.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6beeea5566092c5e3c4896c6d1d307fb46b1d4bdf3e70c8340b190a69198594", size = 6312051, upload_time = "2025-03-10T19:25:00.682Z" }, + { url = "https://files.pythonhosted.org/packages/49/9d/e12ddc726dc8bd1aa6cba67c85ce42a12ba5b9dd75d5042214a59ccf28ce/grpcio-1.71.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5170929109450a2c031cfe87d6716f2fae39695ad5335d9106ae88cc32dc84c", size = 5910666, upload_time = "2025-03-10T19:25:03.01Z" }, + { url = "https://files.pythonhosted.org/packages/d9/e9/38713d6d67aedef738b815763c25f092e0454dc58e77b1d2a51c9d5b3325/grpcio-1.71.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:5b08d03ace7aca7b2fadd4baf291139b4a5f058805a8327bfe9aece7253b6d67", size = 6012019, upload_time = "2025-03-10T19:25:05.174Z" }, + { url = "https://files.pythonhosted.org/packages/80/da/4813cd7adbae6467724fa46c952d7aeac5e82e550b1c62ed2aeb78d444ae/grpcio-1.71.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f903017db76bf9cc2b2d8bdd37bf04b505bbccad6be8a81e1542206875d0e9db", size = 6637043, upload_time = "2025-03-10T19:25:06.987Z" }, + { url = "https://files.pythonhosted.org/packages/52/ca/c0d767082e39dccb7985c73ab4cf1d23ce8613387149e9978c70c3bf3b07/grpcio-1.71.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:469f42a0b410883185eab4689060a20488a1a0a00f8bbb3cbc1061197b4c5a79", size = 6186143, upload_time = "2025-03-10T19:25:08.877Z" }, + { url = "https://files.pythonhosted.org/packages/00/61/7b2c8ec13303f8fe36832c13d91ad4d4ba57204b1c723ada709c346b2271/grpcio-1.71.0-cp312-cp312-win32.whl", hash = "sha256:ad9f30838550695b5eb302add33f21f7301b882937460dd24f24b3cc5a95067a", size = 3604083, upload_time = "2025-03-10T19:25:10.736Z" }, + { url = "https://files.pythonhosted.org/packages/fd/7c/1e429c5fb26122055d10ff9a1d754790fb067d83c633ff69eddcf8e3614b/grpcio-1.71.0-cp312-cp312-win_amd64.whl", hash = "sha256:652350609332de6dac4ece254e5d7e1ff834e203d6afb769601f286886f6f3a8", size = 4272191, upload_time = "2025-03-10T19:25:13.12Z" }, + { url = "https://files.pythonhosted.org/packages/04/dd/b00cbb45400d06b26126dcfdbdb34bb6c4f28c3ebbd7aea8228679103ef6/grpcio-1.71.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:cebc1b34ba40a312ab480ccdb396ff3c529377a2fce72c45a741f7215bfe8379", size = 5184138, upload_time = "2025-03-10T19:25:15.101Z" }, + { url = "https://files.pythonhosted.org/packages/ed/0a/4651215983d590ef53aac40ba0e29dda941a02b097892c44fa3357e706e5/grpcio-1.71.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:85da336e3649a3d2171e82f696b5cad2c6231fdd5bad52616476235681bee5b3", size = 11310747, upload_time = "2025-03-10T19:25:17.201Z" }, + { url = "https://files.pythonhosted.org/packages/57/a3/149615b247f321e13f60aa512d3509d4215173bdb982c9098d78484de216/grpcio-1.71.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f9a412f55bb6e8f3bb000e020dbc1e709627dcb3a56f6431fa7076b4c1aab0db", size = 5653991, upload_time = "2025-03-10T19:25:20.39Z" }, + { url = 
"https://files.pythonhosted.org/packages/ca/56/29432a3e8d951b5e4e520a40cd93bebaa824a14033ea8e65b0ece1da6167/grpcio-1.71.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47be9584729534660416f6d2a3108aaeac1122f6b5bdbf9fd823e11fe6fbaa29", size = 6312781, upload_time = "2025-03-10T19:25:22.823Z" }, + { url = "https://files.pythonhosted.org/packages/a3/f8/286e81a62964ceb6ac10b10925261d4871a762d2a763fbf354115f9afc98/grpcio-1.71.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c9c80ac6091c916db81131d50926a93ab162a7e97e4428ffc186b6e80d6dda4", size = 5910479, upload_time = "2025-03-10T19:25:24.828Z" }, + { url = "https://files.pythonhosted.org/packages/35/67/d1febb49ec0f599b9e6d4d0d44c2d4afdbed9c3e80deb7587ec788fcf252/grpcio-1.71.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:789d5e2a3a15419374b7b45cd680b1e83bbc1e52b9086e49308e2c0b5bbae6e3", size = 6013262, upload_time = "2025-03-10T19:25:26.987Z" }, + { url = "https://files.pythonhosted.org/packages/a1/04/f9ceda11755f0104a075ad7163fc0d96e2e3a9fe25ef38adfc74c5790daf/grpcio-1.71.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1be857615e26a86d7363e8a163fade914595c81fec962b3d514a4b1e8760467b", size = 6643356, upload_time = "2025-03-10T19:25:29.606Z" }, + { url = "https://files.pythonhosted.org/packages/fb/ce/236dbc3dc77cf9a9242adcf1f62538734ad64727fabf39e1346ad4bd5c75/grpcio-1.71.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a76d39b5fafd79ed604c4be0a869ec3581a172a707e2a8d7a4858cb05a5a7637", size = 6186564, upload_time = "2025-03-10T19:25:31.537Z" }, + { url = "https://files.pythonhosted.org/packages/10/fd/b3348fce9dd4280e221f513dd54024e765b21c348bc475516672da4218e9/grpcio-1.71.0-cp313-cp313-win32.whl", hash = "sha256:74258dce215cb1995083daa17b379a1a5a87d275387b7ffe137f1d5131e2cfbb", size = 3601890, upload_time = "2025-03-10T19:25:33.421Z" }, + { url = "https://files.pythonhosted.org/packages/be/f8/db5d5f3fc7e296166286c2a397836b8b042f7ad1e11028d82b061701f0f7/grpcio-1.71.0-cp313-cp313-win_amd64.whl", hash = "sha256:22c3bc8d488c039a199f7a003a38cb7635db6656fa96437a8accde8322ce2366", size = 4273308, upload_time = "2025-03-10T19:25:35.79Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload_time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload_time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "h5py" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/2e/a22d6a8bfa6f8be33e7febd985680fba531562795f0a9077ed1eb047bfb0/h5py-3.13.0.tar.gz", hash = "sha256:1870e46518720023da85d0895a1960ff2ce398c5671eac3b1a41ec696b7105c3", size = 414876, upload_time = "2025-02-18T16:04:01.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/2b/50b15fdefb577d073b49699e6ea6a0a77a3a1016c2b67e2149fc50124a10/h5py-3.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:8a8e38ef4ceb969f832cc230c0cf808c613cc47e31e768fd7b1106c55afa1cb8", size = 3422922, upload_time = "2025-02-18T16:02:36.376Z" }, + { url = "https://files.pythonhosted.org/packages/94/59/36d87a559cab9c59b59088d52e86008d27a9602ce3afc9d3b51823014bf3/h5py-3.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f35640e81b03c02a88b8bf99fb6a9d3023cc52f7c627694db2f379e0028f2868", size = 2921619, upload_time = "2025-02-18T16:02:40.722Z" }, + { url = "https://files.pythonhosted.org/packages/37/ef/6f80b19682c0b0835bbee7b253bec9c16af9004f2fd6427b1dd858100273/h5py-3.13.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:337af114616f3656da0c83b68fcf53ecd9ce9989a700b0883a6e7c483c3235d4", size = 4259366, upload_time = "2025-02-18T16:02:44.544Z" }, + { url = "https://files.pythonhosted.org/packages/03/71/c99f662d4832c8835453cf3476f95daa28372023bda4aa1fca9e97c24f09/h5py-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:782ff0ac39f455f21fd1c8ebc007328f65f43d56718a89327eec76677ebf238a", size = 4509058, upload_time = "2025-02-18T16:02:49.035Z" }, + { url = "https://files.pythonhosted.org/packages/56/89/e3ff23e07131ff73a72a349be9639e4de84e163af89c1c218b939459a98a/h5py-3.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:22ffe2a25770a2d67213a1b94f58006c14dce06933a42d2aaa0318c5868d1508", size = 2966428, upload_time = "2025-02-18T16:02:52.061Z" }, + { url = "https://files.pythonhosted.org/packages/d8/20/438f6366ba4ded80eadb38f8927f5e2cd6d2e087179552f20ae3dbcd5d5b/h5py-3.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:477c58307b6b9a2509c59c57811afb9f598aedede24a67da808262dfa0ee37b4", size = 3384442, upload_time = "2025-02-18T16:02:56.545Z" }, + { url = "https://files.pythonhosted.org/packages/10/13/cc1cb7231399617d9951233eb12fddd396ff5d4f7f057ee5d2b1ca0ee7e7/h5py-3.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:57c4c74f627c616f02b7aec608a8c706fe08cb5b0ba7c08555a4eb1dde20805a", size = 2917567, upload_time = "2025-02-18T16:03:00.079Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d9/aed99e1c858dc698489f916eeb7c07513bc864885d28ab3689d572ba0ea0/h5py-3.13.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:357e6dc20b101a805ccfd0024731fbaf6e8718c18c09baf3b5e4e9d198d13fca", size = 4669544, upload_time = "2025-02-18T16:03:05.675Z" }, + { url = "https://files.pythonhosted.org/packages/a7/da/3c137006ff5f0433f0fb076b1ebe4a7bf7b5ee1e8811b5486af98b500dd5/h5py-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6f13f9b5ce549448c01e4dfe08ea8d1772e6078799af2c1c8d09e941230a90d", size = 4932139, upload_time = "2025-02-18T16:03:10.129Z" }, + { url = "https://files.pythonhosted.org/packages/25/61/d897952629cae131c19d4c41b2521e7dd6382f2d7177c87615c2e6dced1a/h5py-3.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:21daf38171753899b5905f3d82c99b0b1ec2cbbe282a037cad431feb620e62ec", size = 2954179, upload_time = "2025-02-18T16:03:13.716Z" }, + { url = "https://files.pythonhosted.org/packages/60/43/f276f27921919a9144074320ce4ca40882fc67b3cfee81c3f5c7df083e97/h5py-3.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e520ec76de00943dd017c8ea3f354fa1d2f542eac994811943a8faedf2a7d5cb", size = 3358040, upload_time = "2025-02-18T16:03:20.579Z" }, + { url = "https://files.pythonhosted.org/packages/1b/86/ad4a4cf781b08d4572be8bbdd8f108bb97b266a14835c640dc43dafc0729/h5py-3.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e79d8368cd9295045956bfb436656bea3f915beaa11d342e9f79f129f5178763", 
size = 2892766, upload_time = "2025-02-18T16:03:26.831Z" }, + { url = "https://files.pythonhosted.org/packages/69/84/4c6367d6b58deaf0fa84999ec819e7578eee96cea6cbd613640d0625ed5e/h5py-3.13.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56dd172d862e850823c4af02dc4ddbc308f042b85472ffdaca67f1598dff4a57", size = 4664255, upload_time = "2025-02-18T16:03:31.903Z" }, + { url = "https://files.pythonhosted.org/packages/fd/41/bc2df86b72965775f6d621e0ee269a5f3ac23e8f870abf519de9c7d93b4d/h5py-3.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be949b46b7388074c5acae017fbbe3e5ba303fd9daaa52157fdfef30bbdacadd", size = 4927580, upload_time = "2025-02-18T16:03:36.429Z" }, + { url = "https://files.pythonhosted.org/packages/97/34/165b87ea55184770a0c1fcdb7e017199974ad2e271451fd045cfe35f3add/h5py-3.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:4f97ecde7ac6513b21cd95efdfc38dc6d19f96f6ca6f2a30550e94e551458e0a", size = 2940890, upload_time = "2025-02-18T16:03:41.037Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload_time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload_time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httplib2" +version = "0.22.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyparsing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/ad/2371116b22d616c194aa25ec410c9c6c37f23599dcd590502b74db197584/httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81", size = 351116, upload_time = "2023-03-21T22:29:37.214Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/6c/d2fbdaaa5959339d53ba38e94c123e4e84b8fbc4b84beb0e70d7c1608486/httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc", size = 96854, upload_time = "2023-03-21T22:29:35.683Z" }, +] + +[[package]] +name = "httptools" +version = "0.6.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/9a/ce5e1f7e131522e6d3426e8e7a490b3a01f39a6696602e1c4f33f9e94277/httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c", size = 240639, upload_time = "2024-10-16T19:45:08.902Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/26/bb526d4d14c2774fe07113ca1db7255737ffbb119315839af2065abfdac3/httptools-0.6.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069", size = 199029, upload_time = "2024-10-16T19:44:18.427Z" }, + { url = "https://files.pythonhosted.org/packages/a6/17/3e0d3e9b901c732987a45f4f94d4e2c62b89a041d93db89eafb262afd8d5/httptools-0.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a", size = 103492, upload_time = 
"2024-10-16T19:44:19.515Z" }, + { url = "https://files.pythonhosted.org/packages/b7/24/0fe235d7b69c42423c7698d086d4db96475f9b50b6ad26a718ef27a0bce6/httptools-0.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975", size = 462891, upload_time = "2024-10-16T19:44:21.067Z" }, + { url = "https://files.pythonhosted.org/packages/b1/2f/205d1f2a190b72da6ffb5f41a3736c26d6fa7871101212b15e9b5cd8f61d/httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636", size = 459788, upload_time = "2024-10-16T19:44:22.958Z" }, + { url = "https://files.pythonhosted.org/packages/6e/4c/d09ce0eff09057a206a74575ae8f1e1e2f0364d20e2442224f9e6612c8b9/httptools-0.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721", size = 433214, upload_time = "2024-10-16T19:44:24.513Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d2/84c9e23edbccc4a4c6f96a1b8d99dfd2350289e94f00e9ccc7aadde26fb5/httptools-0.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988", size = 434120, upload_time = "2024-10-16T19:44:26.295Z" }, + { url = "https://files.pythonhosted.org/packages/d0/46/4d8e7ba9581416de1c425b8264e2cadd201eb709ec1584c381f3e98f51c1/httptools-0.6.4-cp311-cp311-win_amd64.whl", hash = "sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17", size = 88565, upload_time = "2024-10-16T19:44:29.188Z" }, + { url = "https://files.pythonhosted.org/packages/bb/0e/d0b71465c66b9185f90a091ab36389a7352985fe857e352801c39d6127c8/httptools-0.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2", size = 200683, upload_time = "2024-10-16T19:44:30.175Z" }, + { url = "https://files.pythonhosted.org/packages/e2/b8/412a9bb28d0a8988de3296e01efa0bd62068b33856cdda47fe1b5e890954/httptools-0.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44", size = 104337, upload_time = "2024-10-16T19:44:31.786Z" }, + { url = "https://files.pythonhosted.org/packages/9b/01/6fb20be3196ffdc8eeec4e653bc2a275eca7f36634c86302242c4fbb2760/httptools-0.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1", size = 508796, upload_time = "2024-10-16T19:44:32.825Z" }, + { url = "https://files.pythonhosted.org/packages/f7/d8/b644c44acc1368938317d76ac991c9bba1166311880bcc0ac297cb9d6bd7/httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2", size = 510837, upload_time = "2024-10-16T19:44:33.974Z" }, + { url = "https://files.pythonhosted.org/packages/52/d8/254d16a31d543073a0e57f1c329ca7378d8924e7e292eda72d0064987486/httptools-0.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81", size = 485289, upload_time = "2024-10-16T19:44:35.111Z" }, + { url = "https://files.pythonhosted.org/packages/5f/3c/4aee161b4b7a971660b8be71a92c24d6c64372c1ab3ae7f366b3680df20f/httptools-0.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f", size = 489779, upload_time = "2024-10-16T19:44:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/12/b7/5cae71a8868e555f3f67a50ee7f673ce36eac970f029c0c5e9d584352961/httptools-0.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970", size = 88634, upload_time = "2024-10-16T19:44:37.357Z" }, + { url = "https://files.pythonhosted.org/packages/94/a3/9fe9ad23fd35f7de6b91eeb60848986058bd8b5a5c1e256f5860a160cc3e/httptools-0.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660", size = 197214, upload_time = "2024-10-16T19:44:38.738Z" }, + { url = "https://files.pythonhosted.org/packages/ea/d9/82d5e68bab783b632023f2fa31db20bebb4e89dfc4d2293945fd68484ee4/httptools-0.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083", size = 102431, upload_time = "2024-10-16T19:44:39.818Z" }, + { url = "https://files.pythonhosted.org/packages/96/c1/cb499655cbdbfb57b577734fde02f6fa0bbc3fe9fb4d87b742b512908dff/httptools-0.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3", size = 473121, upload_time = "2024-10-16T19:44:41.189Z" }, + { url = "https://files.pythonhosted.org/packages/af/71/ee32fd358f8a3bb199b03261f10921716990808a675d8160b5383487a317/httptools-0.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071", size = 473805, upload_time = "2024-10-16T19:44:42.384Z" }, + { url = "https://files.pythonhosted.org/packages/8a/0a/0d4df132bfca1507114198b766f1737d57580c9ad1cf93c1ff673e3387be/httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5", size = 448858, upload_time = "2024-10-16T19:44:43.959Z" }, + { url = "https://files.pythonhosted.org/packages/1e/6a/787004fdef2cabea27bad1073bf6a33f2437b4dbd3b6fb4a9d71172b1c7c/httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0", size = 452042, upload_time = "2024-10-16T19:44:45.071Z" }, + { url = "https://files.pythonhosted.org/packages/4d/dc/7decab5c404d1d2cdc1bb330b1bf70e83d6af0396fd4fc76fc60c0d522bf/httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8", size = 87682, upload_time = "2024-10-16T19:44:46.46Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload_time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload_time = "2024-12-06T15:37:21.509Z" }, +] + +[[package]] +name = 
"huggingface-hub" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074, upload_time = "2025-01-30T13:45:41.519Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068, upload_time = "2025-01-30T13:45:39.514Z" }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload_time = "2025-03-19T20:09:59.721Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload_time = "2025-03-19T20:10:01.071Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload_time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload_time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "jiter" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/c2/e4562507f52f0af7036da125bb699602ead37a2332af0788f8e0a3417f36/jiter-0.9.0.tar.gz", hash = "sha256:aadba0964deb424daa24492abc3d229c60c4a31bfee205aedbf1acc7639d7893", size = 162604, upload_time = "2025-03-10T21:37:03.278Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/23/44/e241a043f114299254e44d7e777ead311da400517f179665e59611ab0ee4/jiter-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6c4d99c71508912a7e556d631768dcdef43648a93660670986916b297f1c54af", size = 314654, upload_time = "2025-03-10T21:35:23.939Z" }, + { url = "https://files.pythonhosted.org/packages/fb/1b/a7e5e42db9fa262baaa9489d8d14ca93f8663e7f164ed5e9acc9f467fc00/jiter-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f60fb8ce7df529812bf6c625635a19d27f30806885139e367af93f6e734ef58", size = 320909, upload_time = "2025-03-10T21:35:26.127Z" }, + { url = "https://files.pythonhosted.org/packages/60/bf/8ebdfce77bc04b81abf2ea316e9c03b4a866a7d739cf355eae4d6fd9f6fe/jiter-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51c4e1a4f8ea84d98b7b98912aa4290ac3d1eabfde8e3c34541fae30e9d1f08b", size = 341733, upload_time = "2025-03-10T21:35:27.94Z" }, + { url = "https://files.pythonhosted.org/packages/a8/4e/754ebce77cff9ab34d1d0fa0fe98f5d42590fd33622509a3ba6ec37ff466/jiter-0.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f4c677c424dc76684fea3e7285a7a2a7493424bea89ac441045e6a1fb1d7b3b", size = 365097, upload_time = "2025-03-10T21:35:29.605Z" }, + { url = "https://files.pythonhosted.org/packages/32/2c/6019587e6f5844c612ae18ca892f4cd7b3d8bbf49461ed29e384a0f13d98/jiter-0.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2221176dfec87f3470b21e6abca056e6b04ce9bff72315cb0b243ca9e835a4b5", size = 406603, upload_time = "2025-03-10T21:35:31.696Z" }, + { url = "https://files.pythonhosted.org/packages/da/e9/c9e6546c817ab75a1a7dab6dcc698e62e375e1017113e8e983fccbd56115/jiter-0.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c7adb66f899ffa25e3c92bfcb593391ee1947dbdd6a9a970e0d7e713237d572", size = 396625, upload_time = "2025-03-10T21:35:33.182Z" }, + { url = "https://files.pythonhosted.org/packages/be/bd/976b458add04271ebb5a255e992bd008546ea04bb4dcadc042a16279b4b4/jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98d27330fdfb77913c1097a7aab07f38ff2259048949f499c9901700789ac15", size = 351832, upload_time = "2025-03-10T21:35:35.394Z" }, + { url = "https://files.pythonhosted.org/packages/07/51/fe59e307aaebec9265dbad44d9d4381d030947e47b0f23531579b9a7c2df/jiter-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eda3f8cc74df66892b1d06b5d41a71670c22d95a1ca2cbab73654745ce9d0419", size = 384590, upload_time = "2025-03-10T21:35:37.171Z" }, + { url = "https://files.pythonhosted.org/packages/db/55/5dcd2693794d8e6f4889389ff66ef3be557a77f8aeeca8973a97a7c00557/jiter-0.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd5ab5ddc11418dce28343123644a100f487eaccf1de27a459ab36d6cca31043", size = 520690, upload_time = "2025-03-10T21:35:38.717Z" }, + { url = "https://files.pythonhosted.org/packages/54/d5/9f51dc90985e9eb251fbbb747ab2b13b26601f16c595a7b8baba964043bd/jiter-0.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42f8a68a69f047b310319ef8e2f52fdb2e7976fb3313ef27df495cf77bcad965", size = 512649, upload_time = "2025-03-10T21:35:40.157Z" }, + { url = "https://files.pythonhosted.org/packages/a6/e5/4e385945179bcf128fa10ad8dca9053d717cbe09e258110e39045c881fe5/jiter-0.9.0-cp311-cp311-win32.whl", hash = "sha256:a25519efb78a42254d59326ee417d6f5161b06f5da827d94cf521fed961b1ff2", size = 206920, upload_time = "2025-03-10T21:35:41.72Z" }, + { url = 
"https://files.pythonhosted.org/packages/4c/47/5e0b94c603d8e54dd1faab439b40b832c277d3b90743e7835879ab663757/jiter-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:923b54afdd697dfd00d368b7ccad008cccfeb1efb4e621f32860c75e9f25edbd", size = 210119, upload_time = "2025-03-10T21:35:43.46Z" }, + { url = "https://files.pythonhosted.org/packages/af/d7/c55086103d6f29b694ec79156242304adf521577530d9031317ce5338c59/jiter-0.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7b46249cfd6c48da28f89eb0be3f52d6fdb40ab88e2c66804f546674e539ec11", size = 309203, upload_time = "2025-03-10T21:35:44.852Z" }, + { url = "https://files.pythonhosted.org/packages/b0/01/f775dfee50beb420adfd6baf58d1c4d437de41c9b666ddf127c065e5a488/jiter-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:609cf3c78852f1189894383cf0b0b977665f54cb38788e3e6b941fa6d982c00e", size = 319678, upload_time = "2025-03-10T21:35:46.365Z" }, + { url = "https://files.pythonhosted.org/packages/ab/b8/09b73a793714726893e5d46d5c534a63709261af3d24444ad07885ce87cb/jiter-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d726a3890a54561e55a9c5faea1f7655eda7f105bd165067575ace6e65f80bb2", size = 341816, upload_time = "2025-03-10T21:35:47.856Z" }, + { url = "https://files.pythonhosted.org/packages/35/6f/b8f89ec5398b2b0d344257138182cc090302854ed63ed9c9051e9c673441/jiter-0.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e89dc075c1fef8fa9be219e249f14040270dbc507df4215c324a1839522ea75", size = 364152, upload_time = "2025-03-10T21:35:49.397Z" }, + { url = "https://files.pythonhosted.org/packages/9b/ca/978cc3183113b8e4484cc7e210a9ad3c6614396e7abd5407ea8aa1458eef/jiter-0.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e8ffa3c353b1bc4134f96f167a2082494351e42888dfcf06e944f2729cbe1d", size = 406991, upload_time = "2025-03-10T21:35:50.745Z" }, + { url = "https://files.pythonhosted.org/packages/13/3a/72861883e11a36d6aa314b4922125f6ae90bdccc225cd96d24cc78a66385/jiter-0.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:203f28a72a05ae0e129b3ed1f75f56bc419d5f91dfacd057519a8bd137b00c42", size = 395824, upload_time = "2025-03-10T21:35:52.162Z" }, + { url = "https://files.pythonhosted.org/packages/87/67/22728a86ef53589c3720225778f7c5fdb617080e3deaed58b04789418212/jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fca1a02ad60ec30bb230f65bc01f611c8608b02d269f998bc29cca8619a919dc", size = 351318, upload_time = "2025-03-10T21:35:53.566Z" }, + { url = "https://files.pythonhosted.org/packages/69/b9/f39728e2e2007276806d7a6609cda7fac44ffa28ca0d02c49a4f397cc0d9/jiter-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:237e5cee4d5d2659aaf91bbf8ec45052cc217d9446070699441a91b386ae27dc", size = 384591, upload_time = "2025-03-10T21:35:54.95Z" }, + { url = "https://files.pythonhosted.org/packages/eb/8f/8a708bc7fd87b8a5d861f1c118a995eccbe6d672fe10c9753e67362d0dd0/jiter-0.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:528b6b71745e7326eed73c53d4aa57e2a522242320b6f7d65b9c5af83cf49b6e", size = 520746, upload_time = "2025-03-10T21:35:56.444Z" }, + { url = "https://files.pythonhosted.org/packages/95/1e/65680c7488bd2365dbd2980adaf63c562d3d41d3faac192ebc7ef5b4ae25/jiter-0.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9f48e86b57bc711eb5acdfd12b6cb580a59cc9a993f6e7dcb6d8b50522dcd50d", size = 512754, upload_time = "2025-03-10T21:35:58.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/78/f3/fdc43547a9ee6e93c837685da704fb6da7dba311fc022e2766d5277dfde5/jiter-0.9.0-cp312-cp312-win32.whl", hash = "sha256:699edfde481e191d81f9cf6d2211debbfe4bd92f06410e7637dffb8dd5dfde06", size = 207075, upload_time = "2025-03-10T21:36:00.616Z" }, + { url = "https://files.pythonhosted.org/packages/cd/9d/742b289016d155f49028fe1bfbeb935c9bf0ffeefdf77daf4a63a42bb72b/jiter-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:099500d07b43f61d8bd780466d429c45a7b25411b334c60ca875fa775f68ccb0", size = 207999, upload_time = "2025-03-10T21:36:02.366Z" }, + { url = "https://files.pythonhosted.org/packages/e7/1b/4cd165c362e8f2f520fdb43245e2b414f42a255921248b4f8b9c8d871ff1/jiter-0.9.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:2764891d3f3e8b18dce2cff24949153ee30c9239da7c00f032511091ba688ff7", size = 308197, upload_time = "2025-03-10T21:36:03.828Z" }, + { url = "https://files.pythonhosted.org/packages/13/aa/7a890dfe29c84c9a82064a9fe36079c7c0309c91b70c380dc138f9bea44a/jiter-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:387b22fbfd7a62418d5212b4638026d01723761c75c1c8232a8b8c37c2f1003b", size = 318160, upload_time = "2025-03-10T21:36:05.281Z" }, + { url = "https://files.pythonhosted.org/packages/6a/38/5888b43fc01102f733f085673c4f0be5a298f69808ec63de55051754e390/jiter-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d8da8629ccae3606c61d9184970423655fb4e33d03330bcdfe52d234d32f69", size = 341259, upload_time = "2025-03-10T21:36:06.716Z" }, + { url = "https://files.pythonhosted.org/packages/3d/5e/bbdbb63305bcc01006de683b6228cd061458b9b7bb9b8d9bc348a58e5dc2/jiter-0.9.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1be73d8982bdc278b7b9377426a4b44ceb5c7952073dd7488e4ae96b88e1103", size = 363730, upload_time = "2025-03-10T21:36:08.138Z" }, + { url = "https://files.pythonhosted.org/packages/75/85/53a3edc616992fe4af6814c25f91ee3b1e22f7678e979b6ea82d3bc0667e/jiter-0.9.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2228eaaaa111ec54b9e89f7481bffb3972e9059301a878d085b2b449fbbde635", size = 405126, upload_time = "2025-03-10T21:36:10.934Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b3/1ee26b12b2693bd3f0b71d3188e4e5d817b12e3c630a09e099e0a89e28fa/jiter-0.9.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:11509bfecbc319459647d4ac3fd391d26fdf530dad00c13c4dadabf5b81f01a4", size = 393668, upload_time = "2025-03-10T21:36:12.468Z" }, + { url = "https://files.pythonhosted.org/packages/11/87/e084ce261950c1861773ab534d49127d1517b629478304d328493f980791/jiter-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f22238da568be8bbd8e0650e12feeb2cfea15eda4f9fc271d3b362a4fa0604d", size = 352350, upload_time = "2025-03-10T21:36:14.148Z" }, + { url = "https://files.pythonhosted.org/packages/f0/06/7dca84b04987e9df563610aa0bc154ea176e50358af532ab40ffb87434df/jiter-0.9.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17f5d55eb856597607562257c8e36c42bc87f16bef52ef7129b7da11afc779f3", size = 384204, upload_time = "2025-03-10T21:36:15.545Z" }, + { url = "https://files.pythonhosted.org/packages/16/2f/82e1c6020db72f397dd070eec0c85ebc4df7c88967bc86d3ce9864148f28/jiter-0.9.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:6a99bed9fbb02f5bed416d137944419a69aa4c423e44189bc49718859ea83bc5", size = 520322, upload_time = "2025-03-10T21:36:17.016Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/fd/4f0cd3abe83ce208991ca61e7e5df915aa35b67f1c0633eb7cf2f2e88ec7/jiter-0.9.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e057adb0cd1bd39606100be0eafe742de2de88c79df632955b9ab53a086b3c8d", size = 512184, upload_time = "2025-03-10T21:36:18.47Z" }, + { url = "https://files.pythonhosted.org/packages/a0/3c/8a56f6d547731a0b4410a2d9d16bf39c861046f91f57c98f7cab3d2aa9ce/jiter-0.9.0-cp313-cp313-win32.whl", hash = "sha256:f7e6850991f3940f62d387ccfa54d1a92bd4bb9f89690b53aea36b4364bcab53", size = 206504, upload_time = "2025-03-10T21:36:19.809Z" }, + { url = "https://files.pythonhosted.org/packages/f4/1c/0c996fd90639acda75ed7fa698ee5fd7d80243057185dc2f63d4c1c9f6b9/jiter-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:c8ae3bf27cd1ac5e6e8b7a27487bf3ab5f82318211ec2e1346a5b058756361f7", size = 204943, upload_time = "2025-03-10T21:36:21.536Z" }, + { url = "https://files.pythonhosted.org/packages/78/0f/77a63ca7aa5fed9a1b9135af57e190d905bcd3702b36aca46a01090d39ad/jiter-0.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f0b2827fb88dda2cbecbbc3e596ef08d69bda06c6f57930aec8e79505dc17001", size = 317281, upload_time = "2025-03-10T21:36:22.959Z" }, + { url = "https://files.pythonhosted.org/packages/f9/39/a3a1571712c2bf6ec4c657f0d66da114a63a2e32b7e4eb8e0b83295ee034/jiter-0.9.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:062b756ceb1d40b0b28f326cba26cfd575a4918415b036464a52f08632731e5a", size = 350273, upload_time = "2025-03-10T21:36:24.414Z" }, + { url = "https://files.pythonhosted.org/packages/ee/47/3729f00f35a696e68da15d64eb9283c330e776f3b5789bac7f2c0c4df209/jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf", size = 206867, upload_time = "2025-03-10T21:36:25.843Z" }, +] + +[[package]] +name = "keras" +version = "2.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/03/80072f4ee46e3c77e95b06d684fadf90a67759e4e9f1d86a563e0965c71a/keras-2.15.0.tar.gz", hash = "sha256:81871d298c064dc4ac6b58440fdae67bfcf47c8d7ad28580fab401834c06a575", size = 1252015, upload_time = "2023-11-07T00:39:57.716Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/a7/0d4490de967a67f68a538cc9cdb259bff971c4b5787f7765dc7c8f118f71/keras-2.15.0-py3-none-any.whl", hash = "sha256:2dcc6d2e30cf9c951064b63c1f4c404b966c59caf09e01f3549138ec8ee0dd1f", size = 1710438, upload_time = "2023-11-07T00:39:55.57Z" }, +] + +[[package]] +name = "libclang" +version = "18.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/5c/ca35e19a4f142adffa27e3d652196b7362fa612243e2b916845d801454fc/libclang-18.1.1.tar.gz", hash = "sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250", size = 39612, upload_time = "2024-03-17T16:04:37.434Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/49/f5e3e7e1419872b69f6f5e82ba56e33955a74bd537d8a1f5f1eff2f3668a/libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a", size = 25836045, upload_time = "2024-06-30T17:40:31.646Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e5/fc61bbded91a8830ccce94c5294ecd6e88e496cc85f6704bf350c0634b70/libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5", size = 26502641, upload_time = 
"2024-03-18T15:52:26.722Z" }, + { url = "https://files.pythonhosted.org/packages/db/ed/1df62b44db2583375f6a8a5e2ca5432bbdc3edb477942b9b7c848c720055/libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8", size = 26420207, upload_time = "2024-03-17T15:00:26.63Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fc/716c1e62e512ef1c160e7984a73a5fc7df45166f2ff3f254e71c58076f7c/libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b", size = 24515943, upload_time = "2024-03-17T16:03:45.942Z" }, + { url = "https://files.pythonhosted.org/packages/3c/3d/f0ac1150280d8d20d059608cf2d5ff61b7c3b7f7bcf9c0f425ab92df769a/libclang-18.1.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592", size = 23784972, upload_time = "2024-03-17T16:12:47.677Z" }, + { url = "https://files.pythonhosted.org/packages/fe/2f/d920822c2b1ce9326a4c78c0c2b4aa3fde610c7ee9f631b600acb5376c26/libclang-18.1.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe", size = 20259606, upload_time = "2024-03-17T16:17:42.437Z" }, + { url = "https://files.pythonhosted.org/packages/2d/c2/de1db8c6d413597076a4259cea409b83459b2db997c003578affdd32bf66/libclang-18.1.1-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f", size = 24921494, upload_time = "2024-03-17T16:14:20.132Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2d/3f480b1e1d31eb3d6de5e3ef641954e5c67430d5ac93b7fa7e07589576c7/libclang-18.1.1-py2.py3-none-win_amd64.whl", hash = "sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb", size = 26415083, upload_time = "2024-03-17T16:42:21.703Z" }, + { url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112, upload_time = "2024-03-17T16:42:59.565Z" }, +] + +[[package]] +name = "lxml" +version = "5.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/73/a6/0730ff6cbb87e42e1329a486fe4ccbd3f8f728cb629c2671b0d093a85918/lxml-5.1.1.tar.gz", hash = "sha256:42a8aa957e98bd8b884a8142175ec24ce4ef0a57760e8879f193bfe64b757ca9", size = 3838907, upload_time = "2024-03-29T06:46:52.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/01/977ac832ec441dbde7b373faef715d8f58c4052cc88ae01070be7f3d7907/lxml-5.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:906966babd374fdfe46e130fc656488003f0d0d63b7cba612aa5a796c8804283", size = 8756105, upload_time = "2024-03-29T06:43:08.757Z" }, + { url = "https://files.pythonhosted.org/packages/7e/0a/8ef5c87c72ba4d9a8765c829d1abc28c8482ade37735c7c2725221243d3d/lxml-5.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9c03f3715c68fc707d9383d56e482d95d198ba07cb3dad4aee9e5a5ca06b2536", size = 4751802, upload_time = "2024-03-29T06:43:13.165Z" }, + { url = "https://files.pythonhosted.org/packages/b5/2a/9096d632371ce48dafcd0459520c9afd60d3b26b6c00a5d3f8e93fdb089d/lxml-5.1.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d26243d994d4077a50056e9008848e5b421be0c6f0fd4e932a9463e1d89fc42b", 
size = 5202069, upload_time = "2024-03-29T06:43:16.426Z" }, + { url = "https://files.pythonhosted.org/packages/7e/03/dea246cbe3d959062751ec1aa031972e61680ae4a60c67df08bb1305b465/lxml-5.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de00750318ae6869b9dfa6429a4f82b8ecad043049414547474d09db549c2ee", size = 4921442, upload_time = "2024-03-29T06:43:19.842Z" }, + { url = "https://files.pythonhosted.org/packages/84/71/0d510fe3f99a8ddb776d7b803ed1f41b9eb64b30c5f945f241edf238adfa/lxml-5.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29b2771b4eec4e85063f10294facdd9829d010e6cc9668040d0cf936dc56733a", size = 5084186, upload_time = "2024-03-29T06:43:23.7Z" }, + { url = "https://files.pythonhosted.org/packages/de/c3/9fb0276ad05f3dc454d2f8165181039da4cbfb605f53816d7f34d5e93cca/lxml-5.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d9358f7268c161dc0a1c3216018f26c04954b5dd47ba6dead79da6598f4725d4", size = 4962146, upload_time = "2024-03-29T06:43:27.312Z" }, + { url = "https://files.pythonhosted.org/packages/eb/4f/533dd6ece9f4aa2c8455244c074f61facb23944271cc82bcceccc1eca8a1/lxml-5.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8a943826e7a9254eed661a7134fcde3c832a9fecd989d0f47c6e08c7b769cb2c", size = 5094316, upload_time = "2024-03-29T06:43:30.877Z" }, + { url = "https://files.pythonhosted.org/packages/f9/bd/62cc8a995bd34b1f44fc3706bab0c21bde489dc56482a5f4c9a6bb11ff65/lxml-5.1.1-cp311-cp311-win32.whl", hash = "sha256:74d0967c6f91eec6fe91159f9e8ccb3720fa0fbf9f462109c7bef62550df397c", size = 3560964, upload_time = "2024-03-29T06:43:34.609Z" }, + { url = "https://files.pythonhosted.org/packages/02/7e/af62091cc2c3096573458cec140a914b54f4b36892f549449cc556ed34cb/lxml-5.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:26974096654241df08a30dc2eb0e139c1ad5653660aa4b2ced66000230e96c14", size = 3909680, upload_time = "2024-03-29T06:43:38.221Z" }, + { url = "https://files.pythonhosted.org/packages/e3/0a/3901402aef812c57c27d1bb5405a29abb345fbd7e1b595d060bb065e46c6/lxml-5.1.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:55e13a19829dcdbf0c5233062977aeb6daf72e65124909128045976f659164e8", size = 8786323, upload_time = "2024-03-29T06:43:42.886Z" }, + { url = "https://files.pythonhosted.org/packages/04/92/74df36e8ccecdc96260531f0cbbf849ed25d3ff77a5655a3c89d588e982d/lxml-5.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:adedfb61be862f48907218e3a24bf051fd2ecca53358f3958b0bdb17d7881c20", size = 4764866, upload_time = "2024-03-29T06:43:46.476Z" }, + { url = "https://files.pythonhosted.org/packages/7f/b2/5dfbbec91014ffac561d51d4e3467587a646572f111fd7ddd076568d34c7/lxml-5.1.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:77425482e4311d1cff119a2b5ab26c52ec209d2a3d728a54db3223ab91995e20", size = 5153741, upload_time = "2024-03-29T06:43:50.444Z" }, + { url = "https://files.pythonhosted.org/packages/5b/cf/3da2e345dd19b509c9d269000f16888f4ef50f8ca742c268f8142a7e0b84/lxml-5.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d380f183bd03ab827899753ea96dabe27d2025eb0bfd4f2ac0eee4afa0f351d", size = 4853573, upload_time = "2024-03-29T06:43:53.449Z" }, + { url = "https://files.pythonhosted.org/packages/67/78/aad9c76bf995febcacd836e12ecc670c89737502ebe44f69c472918c8ffd/lxml-5.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8682af96b5ad5093aab9eee5e4ff24cb7a9796c78699d914dd456ebfe7484a6", size = 5035899, 
upload_time = "2024-03-29T06:43:57.342Z" }, + { url = "https://files.pythonhosted.org/packages/7e/88/d0cb086fb1b72fec96bb45aad1058ec31b9df3b146245747c0601490428b/lxml-5.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:68eed33377a9925aed7ba56c8611d50aaa1e45638c07a92b4b4b0a0436cc2dd2", size = 4896851, upload_time = "2024-03-29T06:44:00.471Z" }, + { url = "https://files.pythonhosted.org/packages/5d/69/8cb0a076851dcc5fa185042d3f19e61edb596d677280085873fd49043529/lxml-5.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c7c1d2f6e9c7a1c4478146ee38d16dbe0eb3be998424bc0f01346c671c38b86d", size = 5048250, upload_time = "2024-03-29T06:44:04.436Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d7/a3d5f104c46231060d3e177ad946bf5c0bbc5652f960fbf2dedb66f0f9f7/lxml-5.1.1-cp312-cp312-win32.whl", hash = "sha256:81107c8de3e463052ae8fd05fd31b97c371c7a9ce4a189b8bb5f45b0b3545fb9", size = 3571032, upload_time = "2024-03-29T06:44:07.247Z" }, + { url = "https://files.pythonhosted.org/packages/d6/6b/a7c513c461b1448122d27faeb8f4b61150777816303a21fa6f9bb8be3266/lxml-5.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:0e46181d15fae102c53621bed9356b7a599a1e837b978c934a350dd00842b1d9", size = 3909299, upload_time = "2024-03-29T06:44:11.354Z" }, +] + +[[package]] +name = "mako" +version = "1.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload_time = "2025-04-10T12:44:31.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload_time = "2025-04-10T12:50:53.297Z" }, +] + +[[package]] +name = "markdown" +version = "3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/15/222b423b0b88689c266d9eac4e61396fe2cc53464459d6a37618ac863b24/markdown-3.8.tar.gz", hash = "sha256:7df81e63f0df5c4b24b7d156eb81e4690595239b7d70937d0409f1b0de319c6f", size = 360906, upload_time = "2025-04-11T14:42:50.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/3f/afe76f8e2246ffbc867440cbcf90525264df0e658f8a5ca1f872b3f6192a/markdown-3.8-py3-none-any.whl", hash = "sha256:794a929b79c5af141ef5ab0f2f642d0f7b1872981250230e72682346f7cc90dc", size = 106210, upload_time = "2025-04-11T14:42:49.178Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596, upload_time = "2023-06-03T06:41:14.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload_time = "2023-06-03T06:41:11.019Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537, upload_time = "2024-10-18T15:21:54.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/28/bbf83e3f76936960b850435576dd5e67034e200469571be53f69174a2dfd/MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d", size = 14353, upload_time = "2024-10-18T15:21:02.187Z" }, + { url = "https://files.pythonhosted.org/packages/6c/30/316d194b093cde57d448a4c3209f22e3046c5bb2fb0820b118292b334be7/MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93", size = 12392, upload_time = "2024-10-18T15:21:02.941Z" }, + { url = "https://files.pythonhosted.org/packages/f2/96/9cdafba8445d3a53cae530aaf83c38ec64c4d5427d975c974084af5bc5d2/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832", size = 23984, upload_time = "2024-10-18T15:21:03.953Z" }, + { url = "https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84", size = 23120, upload_time = "2024-10-18T15:21:06.495Z" }, + { url = "https://files.pythonhosted.org/packages/8d/21/5e4851379f88f3fad1de30361db501300d4f07bcad047d3cb0449fc51f8c/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca", size = 23032, upload_time = "2024-10-18T15:21:07.295Z" }, + { url = "https://files.pythonhosted.org/packages/00/7b/e92c64e079b2d0d7ddf69899c98842f3f9a60a1ae72657c89ce2655c999d/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798", size = 24057, upload_time = "2024-10-18T15:21:08.073Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ac/46f960ca323037caa0a10662ef97d0a4728e890334fc156b9f9e52bcc4ca/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e", size = 23359, upload_time = "2024-10-18T15:21:09.318Z" }, + { url = "https://files.pythonhosted.org/packages/69/84/83439e16197337b8b14b6a5b9c2105fff81d42c2a7c5b58ac7b62ee2c3b1/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", size = 23306, upload_time = "2024-10-18T15:21:10.185Z" }, + { url = "https://files.pythonhosted.org/packages/9a/34/a15aa69f01e2181ed8d2b685c0d2f6655d5cca2c4db0ddea775e631918cd/MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d", size = 15094, upload_time = "2024-10-18T15:21:11.005Z" }, + { url = "https://files.pythonhosted.org/packages/da/b8/3a3bd761922d416f3dc5d00bfbed11f66b1ab89a0c2b6e887240a30b0f6b/MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b", size = 15521, upload_time = "2024-10-18T15:21:12.911Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274, upload_time = "2024-10-18T15:21:13.777Z" }, + { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348, upload_time = "2024-10-18T15:21:14.822Z" }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149, upload_time = "2024-10-18T15:21:15.642Z" }, + { url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118, upload_time = "2024-10-18T15:21:17.133Z" }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993, upload_time = "2024-10-18T15:21:18.064Z" }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178, upload_time = "2024-10-18T15:21:18.859Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319, upload_time = "2024-10-18T15:21:19.671Z" }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352, upload_time = "2024-10-18T15:21:20.971Z" }, + { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097, upload_time = "2024-10-18T15:21:22.646Z" }, + { url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601, upload_time = "2024-10-18T15:21:23.499Z" }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274, upload_time = "2024-10-18T15:21:24.577Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352, upload_time = "2024-10-18T15:21:25.382Z" }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122, upload_time = "2024-10-18T15:21:26.199Z" }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085, upload_time = "2024-10-18T15:21:27.029Z" }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978, upload_time = "2024-10-18T15:21:27.846Z" }, + { url = "https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208, upload_time = "2024-10-18T15:21:28.744Z" }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357, upload_time = "2024-10-18T15:21:29.545Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344, upload_time = "2024-10-18T15:21:30.366Z" }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101, upload_time = "2024-10-18T15:21:31.207Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603, upload_time = "2024-10-18T15:21:32.032Z" }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510, upload_time = "2024-10-18T15:21:33.625Z" }, + { url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486, upload_time = "2024-10-18T15:21:34.611Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480, upload_time = "2024-10-18T15:21:35.398Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914, upload_time = "2024-10-18T15:21:36.231Z" }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796, upload_time = "2024-10-18T15:21:37.073Z" }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473, upload_time = "2024-10-18T15:21:37.932Z" }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114, upload_time = "2024-10-18T15:21:39.799Z" }, + { url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098, upload_time = "2024-10-18T15:21:40.813Z" }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208, upload_time = "2024-10-18T15:21:41.814Z" }, + { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload_time = "2024-10-18T15:21:42.784Z" }, +] + +[[package]] +name = "marshmallow" +version = "3.23.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/c0/d674c9de69227beafa41e1601b0c48b8b51060212abc231d4332e4b1e794/marshmallow-3.23.3.tar.gz", hash = "sha256:d586c8685ebdb80bf754e1f96e3f305aaf30951f1fc69175b977453633467e76", size = 175606, upload_time = "2025-01-03T20:18:41.52Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/82/d8c37cc92948ce11e5d8d71602bbac7ac4257f9e1f918fd91b1ddac4ec97/marshmallow-3.23.3-py3-none-any.whl", hash = "sha256:20c0f8c613f68bcb45b2a0d3282e2f172575560170bf220d67aafb42717910e4", size = 48911, upload_time = "2025-01-03T20:18:39.62Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload_time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload_time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "ml-dtypes" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/39/7d/8d85fcba868758b3a546e6914e727abd8f29ea6918079f816975c9eecd63/ml_dtypes-0.3.2.tar.gz", hash = "sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967", size = 692014, upload_time = "2024-01-03T19:21:23.615Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/a4/6aabb78f1569550fd77c74d2c1d008b502c8ce72776bd88b14ea6c182c9e/ml_dtypes-0.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe", size = 389791, upload_time = "2024-01-03T19:21:02.844Z" }, + { url = "https://files.pythonhosted.org/packages/d1/ed/211bf2e1c66e4ec9b712c3be848a876185c7f0d5e94bf647b60e64ef32eb/ml_dtypes-0.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462", size = 2185796, upload_time = "2024-01-03T19:21:04.291Z" }, + { url = "https://files.pythonhosted.org/packages/77/a0/d4ee9e3aca5b9101c590b58555820618e8201c2ccb7004eabb417ec046ac/ml_dtypes-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226", size = 2164071, upload_time = "2024-01-03T19:21:05.78Z" }, + { url = "https://files.pythonhosted.org/packages/a4/db/1784b87285588788170f87e987bfb4bda218d62a70a81ebb66c94e7f9b95/ml_dtypes-0.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655", size = 127681, upload_time = "2024-01-03T19:21:07.337Z" }, + { url = "https://files.pythonhosted.org/packages/ad/2d/57a8aa1ba7472a93a675bfba3f0c90d9396d01d040617a5345ce87884330/ml_dtypes-0.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18", size = 393571, upload_time = "2024-01-03T19:21:08.836Z" }, + { url = "https://files.pythonhosted.org/packages/6a/05/ec30199c791cf0d788a26f56d8efb8ee4133ede79a9680fd8cc05e706404/ml_dtypes-0.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33", size = 2180925, upload_time = "2024-01-03T19:21:10.87Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f1/93219c44bae4017e6e43391fa4433592de08e05def9d885227d3596f21a5/ml_dtypes-0.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855", size = 2160573, upload_time = "2024-01-03T19:21:12.775Z" }, + { url = "https://files.pythonhosted.org/packages/47/f3/847da54c3d243ff2aa778078ecf09da199194d282744718ef325dd8afd41/ml_dtypes-0.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4", size 
= 128649, upload_time = "2024-01-03T19:21:14.312Z" }, +] + +[[package]] +name = "multidict" +version = "6.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/da/2c/e367dfb4c6538614a0c9453e510d75d66099edf1c4e69da1b5ce691a1931/multidict-6.4.3.tar.gz", hash = "sha256:3ada0b058c9f213c5f95ba301f922d402ac234f1111a7d8fd70f1b99f3c281ec", size = 89372, upload_time = "2025-04-10T22:20:17.956Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e0/53cf7f27eda48fffa53cfd4502329ed29e00efb9e4ce41362cbf8aa54310/multidict-6.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f6f19170197cc29baccd33ccc5b5d6a331058796485857cf34f7635aa25fb0cd", size = 65259, upload_time = "2025-04-10T22:17:59.632Z" }, + { url = "https://files.pythonhosted.org/packages/44/79/1dcd93ce7070cf01c2ee29f781c42b33c64fce20033808f1cc9ec8413d6e/multidict-6.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f2882bf27037eb687e49591690e5d491e677272964f9ec7bc2abbe09108bdfb8", size = 38451, upload_time = "2025-04-10T22:18:01.202Z" }, + { url = "https://files.pythonhosted.org/packages/f4/35/2292cf29ab5f0d0b3613fad1b75692148959d3834d806be1885ceb49a8ff/multidict-6.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fbf226ac85f7d6b6b9ba77db4ec0704fde88463dc17717aec78ec3c8546c70ad", size = 37706, upload_time = "2025-04-10T22:18:02.276Z" }, + { url = "https://files.pythonhosted.org/packages/f6/d1/6b157110b2b187b5a608b37714acb15ee89ec773e3800315b0107ea648cd/multidict-6.4.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e329114f82ad4b9dd291bef614ea8971ec119ecd0f54795109976de75c9a852", size = 226669, upload_time = "2025-04-10T22:18:03.436Z" }, + { url = "https://files.pythonhosted.org/packages/40/7f/61a476450651f177c5570e04bd55947f693077ba7804fe9717ee9ae8de04/multidict-6.4.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1f4e0334d7a555c63f5c8952c57ab6f1c7b4f8c7f3442df689fc9f03df315c08", size = 223182, upload_time = "2025-04-10T22:18:04.922Z" }, + { url = "https://files.pythonhosted.org/packages/51/7b/eaf7502ac4824cdd8edcf5723e2e99f390c879866aec7b0c420267b53749/multidict-6.4.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:740915eb776617b57142ce0bb13b7596933496e2f798d3d15a20614adf30d229", size = 235025, upload_time = "2025-04-10T22:18:06.274Z" }, + { url = "https://files.pythonhosted.org/packages/3b/f6/facdbbd73c96b67a93652774edd5778ab1167854fa08ea35ad004b1b70ad/multidict-6.4.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255dac25134d2b141c944b59a0d2f7211ca12a6d4779f7586a98b4b03ea80508", size = 231481, upload_time = "2025-04-10T22:18:07.742Z" }, + { url = "https://files.pythonhosted.org/packages/70/57/c008e861b3052405eebf921fd56a748322d8c44dcfcab164fffbccbdcdc4/multidict-6.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4e8535bd4d741039b5aad4285ecd9b902ef9e224711f0b6afda6e38d7ac02c7", size = 223492, upload_time = "2025-04-10T22:18:09.095Z" }, + { url = "https://files.pythonhosted.org/packages/30/4d/7d8440d3a12a6ae5d6b202d6e7f2ac6ab026e04e99aaf1b73f18e6bc34bc/multidict-6.4.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c433a33be000dd968f5750722eaa0991037be0be4a9d453eba121774985bc8", size = 217279, upload_time = "2025-04-10T22:18:10.474Z" }, + { url = 
"https://files.pythonhosted.org/packages/7f/e7/bca0df4dd057597b94138d2d8af04eb3c27396a425b1b0a52e082f9be621/multidict-6.4.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4eb33b0bdc50acd538f45041f5f19945a1f32b909b76d7b117c0c25d8063df56", size = 228733, upload_time = "2025-04-10T22:18:11.793Z" }, + { url = "https://files.pythonhosted.org/packages/88/f5/383827c3f1c38d7c92dbad00a8a041760228573b1c542fbf245c37bbca8a/multidict-6.4.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:75482f43465edefd8a5d72724887ccdcd0c83778ded8f0cb1e0594bf71736cc0", size = 218089, upload_time = "2025-04-10T22:18:13.153Z" }, + { url = "https://files.pythonhosted.org/packages/36/8a/a5174e8a7d8b94b4c8f9c1e2cf5d07451f41368ffe94d05fc957215b8e72/multidict-6.4.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ce5b3082e86aee80b3925ab4928198450d8e5b6466e11501fe03ad2191c6d777", size = 225257, upload_time = "2025-04-10T22:18:14.654Z" }, + { url = "https://files.pythonhosted.org/packages/8c/76/1d4b7218f0fd00b8e5c90b88df2e45f8af127f652f4e41add947fa54c1c4/multidict-6.4.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e413152e3212c4d39f82cf83c6f91be44bec9ddea950ce17af87fbf4e32ca6b2", size = 234728, upload_time = "2025-04-10T22:18:16.236Z" }, + { url = "https://files.pythonhosted.org/packages/64/44/18372a4f6273fc7ca25630d7bf9ae288cde64f29593a078bff450c7170b6/multidict-6.4.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:8aac2eeff69b71f229a405c0a4b61b54bade8e10163bc7b44fcd257949620618", size = 230087, upload_time = "2025-04-10T22:18:17.979Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ae/28728c314a698d8a6d9491fcacc897077348ec28dd85884d09e64df8a855/multidict-6.4.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ab583ac203af1d09034be41458feeab7863c0635c650a16f15771e1386abf2d7", size = 223137, upload_time = "2025-04-10T22:18:19.362Z" }, + { url = "https://files.pythonhosted.org/packages/22/50/785bb2b3fe16051bc91c70a06a919f26312da45c34db97fc87441d61e343/multidict-6.4.3-cp311-cp311-win32.whl", hash = "sha256:1b2019317726f41e81154df636a897de1bfe9228c3724a433894e44cd2512378", size = 34959, upload_time = "2025-04-10T22:18:20.728Z" }, + { url = "https://files.pythonhosted.org/packages/2f/63/2a22e099ae2f4d92897618c00c73a09a08a2a9aa14b12736965bf8d59fd3/multidict-6.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:43173924fa93c7486402217fab99b60baf78d33806af299c56133a3755f69589", size = 38541, upload_time = "2025-04-10T22:18:22.001Z" }, + { url = "https://files.pythonhosted.org/packages/fc/bb/3abdaf8fe40e9226ce8a2ba5ecf332461f7beec478a455d6587159f1bf92/multidict-6.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f1c2f58f08b36f8475f3ec6f5aeb95270921d418bf18f90dffd6be5c7b0e676", size = 64019, upload_time = "2025-04-10T22:18:23.174Z" }, + { url = "https://files.pythonhosted.org/packages/7e/b5/1b2e8de8217d2e89db156625aa0fe4a6faad98972bfe07a7b8c10ef5dd6b/multidict-6.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:26ae9ad364fc61b936fb7bf4c9d8bd53f3a5b4417142cd0be5c509d6f767e2f1", size = 37925, upload_time = "2025-04-10T22:18:24.834Z" }, + { url = "https://files.pythonhosted.org/packages/b4/e2/3ca91c112644a395c8eae017144c907d173ea910c913ff8b62549dcf0bbf/multidict-6.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:659318c6c8a85f6ecfc06b4e57529e5a78dfdd697260cc81f683492ad7e9435a", size = 37008, upload_time = "2025-04-10T22:18:26.069Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/23/79bc78146c7ac8d1ac766b2770ca2e07c2816058b8a3d5da6caed8148637/multidict-6.4.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1eb72c741fd24d5a28242ce72bb61bc91f8451877131fa3fe930edb195f7054", size = 224374, upload_time = "2025-04-10T22:18:27.714Z" }, + { url = "https://files.pythonhosted.org/packages/86/35/77950ed9ebd09136003a85c1926ba42001ca5be14feb49710e4334ee199b/multidict-6.4.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3cd06d88cb7398252284ee75c8db8e680aa0d321451132d0dba12bc995f0adcc", size = 230869, upload_time = "2025-04-10T22:18:29.162Z" }, + { url = "https://files.pythonhosted.org/packages/49/97/2a33c6e7d90bc116c636c14b2abab93d6521c0c052d24bfcc231cbf7f0e7/multidict-6.4.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4543d8dc6470a82fde92b035a92529317191ce993533c3c0c68f56811164ed07", size = 231949, upload_time = "2025-04-10T22:18:30.679Z" }, + { url = "https://files.pythonhosted.org/packages/56/ce/e9b5d9fcf854f61d6686ada7ff64893a7a5523b2a07da6f1265eaaea5151/multidict-6.4.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30a3ebdc068c27e9d6081fca0e2c33fdf132ecea703a72ea216b81a66860adde", size = 231032, upload_time = "2025-04-10T22:18:32.146Z" }, + { url = "https://files.pythonhosted.org/packages/f0/ac/7ced59dcdfeddd03e601edb05adff0c66d81ed4a5160c443e44f2379eef0/multidict-6.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b038f10e23f277153f86f95c777ba1958bcd5993194fda26a1d06fae98b2f00c", size = 223517, upload_time = "2025-04-10T22:18:33.538Z" }, + { url = "https://files.pythonhosted.org/packages/db/e6/325ed9055ae4e085315193a1b58bdb4d7fc38ffcc1f4975cfca97d015e17/multidict-6.4.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c605a2b2dc14282b580454b9b5d14ebe0668381a3a26d0ac39daa0ca115eb2ae", size = 216291, upload_time = "2025-04-10T22:18:34.962Z" }, + { url = "https://files.pythonhosted.org/packages/fa/84/eeee6d477dd9dcb7691c3bb9d08df56017f5dd15c730bcc9383dcf201cf4/multidict-6.4.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8bd2b875f4ca2bb527fe23e318ddd509b7df163407b0fb717df229041c6df5d3", size = 228982, upload_time = "2025-04-10T22:18:36.443Z" }, + { url = "https://files.pythonhosted.org/packages/82/94/4d1f3e74e7acf8b0c85db350e012dcc61701cd6668bc2440bb1ecb423c90/multidict-6.4.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c2e98c840c9c8e65c0e04b40c6c5066c8632678cd50c8721fdbcd2e09f21a507", size = 226823, upload_time = "2025-04-10T22:18:37.924Z" }, + { url = "https://files.pythonhosted.org/packages/09/f0/1e54b95bda7cd01080e5732f9abb7b76ab5cc795b66605877caeb2197476/multidict-6.4.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:66eb80dd0ab36dbd559635e62fba3083a48a252633164857a1d1684f14326427", size = 222714, upload_time = "2025-04-10T22:18:39.807Z" }, + { url = "https://files.pythonhosted.org/packages/e7/a2/f6cbca875195bd65a3e53b37ab46486f3cc125bdeab20eefe5042afa31fb/multidict-6.4.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c23831bdee0a2a3cf21be057b5e5326292f60472fb6c6f86392bbf0de70ba731", size = 233739, upload_time = "2025-04-10T22:18:41.341Z" }, + { url = "https://files.pythonhosted.org/packages/79/68/9891f4d2b8569554723ddd6154375295f789dc65809826c6fb96a06314fd/multidict-6.4.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:1535cec6443bfd80d028052e9d17ba6ff8a5a3534c51d285ba56c18af97e9713", size = 230809, upload_time = "2025-04-10T22:18:42.817Z" }, + { url = "https://files.pythonhosted.org/packages/e6/72/a7be29ba1e87e4fc5ceb44dabc7940b8005fd2436a332a23547709315f70/multidict-6.4.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3b73e7227681f85d19dec46e5b881827cd354aabe46049e1a61d2f9aaa4e285a", size = 226934, upload_time = "2025-04-10T22:18:44.311Z" }, + { url = "https://files.pythonhosted.org/packages/12/c1/259386a9ad6840ff7afc686da96808b503d152ac4feb3a96c651dc4f5abf/multidict-6.4.3-cp312-cp312-win32.whl", hash = "sha256:8eac0c49df91b88bf91f818e0a24c1c46f3622978e2c27035bfdca98e0e18124", size = 35242, upload_time = "2025-04-10T22:18:46.193Z" }, + { url = "https://files.pythonhosted.org/packages/06/24/c8fdff4f924d37225dc0c56a28b1dca10728fc2233065fafeb27b4b125be/multidict-6.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:11990b5c757d956cd1db7cb140be50a63216af32cd6506329c2c59d732d802db", size = 38635, upload_time = "2025-04-10T22:18:47.498Z" }, + { url = "https://files.pythonhosted.org/packages/6c/4b/86fd786d03915c6f49998cf10cd5fe6b6ac9e9a071cb40885d2e080fb90d/multidict-6.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a76534263d03ae0cfa721fea40fd2b5b9d17a6f85e98025931d41dc49504474", size = 63831, upload_time = "2025-04-10T22:18:48.748Z" }, + { url = "https://files.pythonhosted.org/packages/45/05/9b51fdf7aef2563340a93be0a663acba2c428c4daeaf3960d92d53a4a930/multidict-6.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:805031c2f599eee62ac579843555ed1ce389ae00c7e9f74c2a1b45e0564a88dd", size = 37888, upload_time = "2025-04-10T22:18:50.021Z" }, + { url = "https://files.pythonhosted.org/packages/0b/43/53fc25394386c911822419b522181227ca450cf57fea76e6188772a1bd91/multidict-6.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c56c179839d5dcf51d565132185409d1d5dd8e614ba501eb79023a6cab25576b", size = 36852, upload_time = "2025-04-10T22:18:51.246Z" }, + { url = "https://files.pythonhosted.org/packages/8a/68/7b99c751e822467c94a235b810a2fd4047d4ecb91caef6b5c60116991c4b/multidict-6.4.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c64f4ddb3886dd8ab71b68a7431ad4aa01a8fa5be5b11543b29674f29ca0ba3", size = 223644, upload_time = "2025-04-10T22:18:52.965Z" }, + { url = "https://files.pythonhosted.org/packages/80/1b/d458d791e4dd0f7e92596667784fbf99e5c8ba040affe1ca04f06b93ae92/multidict-6.4.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3002a856367c0b41cad6784f5b8d3ab008eda194ed7864aaa58f65312e2abcac", size = 230446, upload_time = "2025-04-10T22:18:54.509Z" }, + { url = "https://files.pythonhosted.org/packages/e2/46/9793378d988905491a7806d8987862dc5a0bae8a622dd896c4008c7b226b/multidict-6.4.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d75e621e7d887d539d6e1d789f0c64271c250276c333480a9e1de089611f790", size = 231070, upload_time = "2025-04-10T22:18:56.019Z" }, + { url = "https://files.pythonhosted.org/packages/a7/b8/b127d3e1f8dd2a5bf286b47b24567ae6363017292dc6dec44656e6246498/multidict-6.4.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:995015cf4a3c0d72cbf453b10a999b92c5629eaf3a0c3e1efb4b5c1f602253bb", size = 229956, upload_time = "2025-04-10T22:18:59.146Z" }, + { url = "https://files.pythonhosted.org/packages/0c/93/f70a4c35b103fcfe1443059a2bb7f66e5c35f2aea7804105ff214f566009/multidict-6.4.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash 
= "sha256:a2b0fabae7939d09d7d16a711468c385272fa1b9b7fb0d37e51143585d8e72e0", size = 222599, upload_time = "2025-04-10T22:19:00.657Z" }, + { url = "https://files.pythonhosted.org/packages/63/8c/e28e0eb2fe34921d6aa32bfc4ac75b09570b4d6818cc95d25499fe08dc1d/multidict-6.4.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:61ed4d82f8a1e67eb9eb04f8587970d78fe7cddb4e4d6230b77eda23d27938f9", size = 216136, upload_time = "2025-04-10T22:19:02.244Z" }, + { url = "https://files.pythonhosted.org/packages/72/f5/fbc81f866585b05f89f99d108be5d6ad170e3b6c4d0723d1a2f6ba5fa918/multidict-6.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:062428944a8dc69df9fdc5d5fc6279421e5f9c75a9ee3f586f274ba7b05ab3c8", size = 228139, upload_time = "2025-04-10T22:19:04.151Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ba/7d196bad6b85af2307d81f6979c36ed9665f49626f66d883d6c64d156f78/multidict-6.4.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:b90e27b4674e6c405ad6c64e515a505c6d113b832df52fdacb6b1ffd1fa9a1d1", size = 226251, upload_time = "2025-04-10T22:19:06.117Z" }, + { url = "https://files.pythonhosted.org/packages/cc/e2/fae46a370dce79d08b672422a33df721ec8b80105e0ea8d87215ff6b090d/multidict-6.4.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7d50d4abf6729921e9613d98344b74241572b751c6b37feed75fb0c37bd5a817", size = 221868, upload_time = "2025-04-10T22:19:07.981Z" }, + { url = "https://files.pythonhosted.org/packages/26/20/bbc9a3dec19d5492f54a167f08546656e7aef75d181d3d82541463450e88/multidict-6.4.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:43fe10524fb0a0514be3954be53258e61d87341008ce4914f8e8b92bee6f875d", size = 233106, upload_time = "2025-04-10T22:19:09.5Z" }, + { url = "https://files.pythonhosted.org/packages/ee/8d/f30ae8f5ff7a2461177f4d8eb0d8f69f27fb6cfe276b54ec4fd5a282d918/multidict-6.4.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:236966ca6c472ea4e2d3f02f6673ebfd36ba3f23159c323f5a496869bc8e47c9", size = 230163, upload_time = "2025-04-10T22:19:11Z" }, + { url = "https://files.pythonhosted.org/packages/15/e9/2833f3c218d3c2179f3093f766940ded6b81a49d2e2f9c46ab240d23dfec/multidict-6.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:422a5ec315018e606473ba1f5431e064cf8b2a7468019233dcf8082fabad64c8", size = 225906, upload_time = "2025-04-10T22:19:12.875Z" }, + { url = "https://files.pythonhosted.org/packages/f1/31/6edab296ac369fd286b845fa5dd4c409e63bc4655ed8c9510fcb477e9ae9/multidict-6.4.3-cp313-cp313-win32.whl", hash = "sha256:f901a5aace8e8c25d78960dcc24c870c8d356660d3b49b93a78bf38eb682aac3", size = 35238, upload_time = "2025-04-10T22:19:14.41Z" }, + { url = "https://files.pythonhosted.org/packages/23/57/2c0167a1bffa30d9a1383c3dab99d8caae985defc8636934b5668830d2ef/multidict-6.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:1c152c49e42277bc9a2f7b78bd5fa10b13e88d1b0328221e7aef89d5c60a99a5", size = 38799, upload_time = "2025-04-10T22:19:15.869Z" }, + { url = "https://files.pythonhosted.org/packages/c9/13/2ead63b9ab0d2b3080819268acb297bd66e238070aa8d42af12b08cbee1c/multidict-6.4.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:be8751869e28b9c0d368d94f5afcb4234db66fe8496144547b4b6d6a0645cfc6", size = 68642, upload_time = "2025-04-10T22:19:17.527Z" }, + { url = "https://files.pythonhosted.org/packages/85/45/f1a751e1eede30c23951e2ae274ce8fad738e8a3d5714be73e0a41b27b16/multidict-6.4.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0d4b31f8a68dccbcd2c0ea04f0e014f1defc6b78f0eb8b35f2265e8716a6df0c", 
size = 40028, upload_time = "2025-04-10T22:19:19.465Z" }, + { url = "https://files.pythonhosted.org/packages/a7/29/fcc53e886a2cc5595cc4560df333cb9630257bda65003a7eb4e4e0d8f9c1/multidict-6.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:032efeab3049e37eef2ff91271884303becc9e54d740b492a93b7e7266e23756", size = 39424, upload_time = "2025-04-10T22:19:20.762Z" }, + { url = "https://files.pythonhosted.org/packages/f6/f0/056c81119d8b88703971f937b371795cab1407cd3c751482de5bfe1a04a9/multidict-6.4.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e78006af1a7c8a8007e4f56629d7252668344442f66982368ac06522445e375", size = 226178, upload_time = "2025-04-10T22:19:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/a3/79/3b7e5fea0aa80583d3a69c9d98b7913dfd4fbc341fb10bb2fb48d35a9c21/multidict-6.4.3-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:daeac9dd30cda8703c417e4fddccd7c4dc0c73421a0b54a7da2713be125846be", size = 222617, upload_time = "2025-04-10T22:19:23.773Z" }, + { url = "https://files.pythonhosted.org/packages/06/db/3ed012b163e376fc461e1d6a67de69b408339bc31dc83d39ae9ec3bf9578/multidict-6.4.3-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f6f90700881438953eae443a9c6f8a509808bc3b185246992c4233ccee37fea", size = 227919, upload_time = "2025-04-10T22:19:25.35Z" }, + { url = "https://files.pythonhosted.org/packages/b1/db/0433c104bca380989bc04d3b841fc83e95ce0c89f680e9ea4251118b52b6/multidict-6.4.3-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f84627997008390dd15762128dcf73c3365f4ec0106739cde6c20a07ed198ec8", size = 226097, upload_time = "2025-04-10T22:19:27.183Z" }, + { url = "https://files.pythonhosted.org/packages/c2/95/910db2618175724dd254b7ae635b6cd8d2947a8b76b0376de7b96d814dab/multidict-6.4.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3307b48cd156153b117c0ea54890a3bdbf858a5b296ddd40dc3852e5f16e9b02", size = 220706, upload_time = "2025-04-10T22:19:28.882Z" }, + { url = "https://files.pythonhosted.org/packages/d1/af/aa176c6f5f1d901aac957d5258d5e22897fe13948d1e69063ae3d5d0ca01/multidict-6.4.3-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ead46b0fa1dcf5af503a46e9f1c2e80b5d95c6011526352fa5f42ea201526124", size = 211728, upload_time = "2025-04-10T22:19:30.481Z" }, + { url = "https://files.pythonhosted.org/packages/e7/42/d51cc5fc1527c3717d7f85137d6c79bb7a93cd214c26f1fc57523774dbb5/multidict-6.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1748cb2743bedc339d63eb1bca314061568793acd603a6e37b09a326334c9f44", size = 226276, upload_time = "2025-04-10T22:19:32.454Z" }, + { url = "https://files.pythonhosted.org/packages/28/6b/d836dea45e0b8432343ba4acf9a8ecaa245da4c0960fb7ab45088a5e568a/multidict-6.4.3-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:acc9fa606f76fc111b4569348cc23a771cb52c61516dcc6bcef46d612edb483b", size = 212069, upload_time = "2025-04-10T22:19:34.17Z" }, + { url = "https://files.pythonhosted.org/packages/55/34/0ee1a7adb3560e18ee9289c6e5f7db54edc312b13e5c8263e88ea373d12c/multidict-6.4.3-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:31469d5832b5885adeb70982e531ce86f8c992334edd2f2254a10fa3182ac504", size = 217858, upload_time = "2025-04-10T22:19:35.879Z" }, + { url = 
"https://files.pythonhosted.org/packages/04/08/586d652c2f5acefe0cf4e658eedb4d71d4ba6dfd4f189bd81b400fc1bc6b/multidict-6.4.3-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ba46b51b6e51b4ef7bfb84b82f5db0dc5e300fb222a8a13b8cd4111898a869cf", size = 226988, upload_time = "2025-04-10T22:19:37.434Z" }, + { url = "https://files.pythonhosted.org/packages/82/e3/cc59c7e2bc49d7f906fb4ffb6d9c3a3cf21b9f2dd9c96d05bef89c2b1fd1/multidict-6.4.3-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:389cfefb599edf3fcfd5f64c0410da686f90f5f5e2c4d84e14f6797a5a337af4", size = 220435, upload_time = "2025-04-10T22:19:39.005Z" }, + { url = "https://files.pythonhosted.org/packages/e0/32/5c3a556118aca9981d883f38c4b1bfae646f3627157f70f4068e5a648955/multidict-6.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:64bc2bbc5fba7b9db5c2c8d750824f41c6994e3882e6d73c903c2afa78d091e4", size = 221494, upload_time = "2025-04-10T22:19:41.447Z" }, + { url = "https://files.pythonhosted.org/packages/b9/3b/1599631f59024b75c4d6e3069f4502409970a336647502aaf6b62fb7ac98/multidict-6.4.3-cp313-cp313t-win32.whl", hash = "sha256:0ecdc12ea44bab2807d6b4a7e5eef25109ab1c82a8240d86d3c1fc9f3b72efd5", size = 41775, upload_time = "2025-04-10T22:19:43.707Z" }, + { url = "https://files.pythonhosted.org/packages/e8/4e/09301668d675d02ca8e8e1a3e6be046619e30403f5ada2ed5b080ae28d02/multidict-6.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:7146a8742ea71b5d7d955bffcef58a9e6e04efba704b52a460134fefd10a8208", size = 45946, upload_time = "2025-04-10T22:19:45.071Z" }, + { url = "https://files.pythonhosted.org/packages/96/10/7d526c8974f017f1e7ca584c71ee62a638e9334d8d33f27d7cdfc9ae79e4/multidict-6.4.3-py3-none-any.whl", hash = "sha256:59fe01ee8e2a1e8ceb3f6dbb216b09c8d9f4ef1c22c4fc825d045a147fa2ebc9", size = 10400, upload_time = "2025-04-10T22:20:16.445Z" }, +] + +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload_time = "2024-01-28T18:52:34.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload_time = "2024-01-28T18:52:26.062Z" }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload_time = "2024-01-28T18:52:28.115Z" }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload_time = "2024-01-28T18:52:29.395Z" }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628, upload_time = "2024-01-28T18:52:30.853Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload_time = "2024-01-28T18:52:31.981Z" }, +] + +[[package]] +name = "numpy" +version = "1.26.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload_time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload_time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload_time = "2024-02-05T23:52:15.314Z" }, + { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload_time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload_time = "2024-02-05T23:53:15.637Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload_time = "2024-02-05T23:53:42.16Z" }, + { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload_time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload_time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload_time = "2024-02-05T23:54:53.933Z" }, + { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload_time = "2024-02-05T23:55:32.801Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload_time = "2024-02-05T23:55:56.28Z" }, + { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload_time = "2024-02-05T23:56:20.368Z" }, + { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload_time = "2024-02-05T23:56:56.054Z" }, + { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload_time = "2024-02-05T23:57:21.56Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload_time = "2024-02-05T23:57:56.585Z" }, + { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload_time = "2024-02-05T23:58:08.963Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload_time = "2024-02-05T23:58:36.364Z" }, +] + +[[package]] +name = "oauthlib" +version = "3.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6d/fa/fbf4001037904031639e6bfbfc02badfc7e12f137a8afa254df6c4c8a670/oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918", size = 177352, upload_time = "2022-10-17T20:04:27.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/80/cab10959dc1faead58dc8384a781dfbf93cb4d33d50988f7a69f1b7c9bbe/oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", size = 151688, upload_time = "2022-10-17T20:04:24.037Z" }, +] + +[[package]] +name = "openai" +version = "1.60.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/08/ae/8d9706b8ff2363287b4a8807de2dd29cdbdad5424e9d05d345df724320f5/openai-1.60.2.tar.gz", hash = "sha256:a8f843e10f2855713007f491d96afb2694b11b5e02cb97c7d01a0be60bc5bb51", size = 348185, upload_time = "2025-01-27T19:37:03.72Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e5/5a/d5474ca67a547dde9b87b5bc8a8f90eadf29f523d410f2ba23d63c9b82ec/openai-1.60.2-py3-none-any.whl", hash = "sha256:993bd11b96900b9098179c728026f016b4982ded7ee30dfcf4555eab1171fff9", size = 456107, upload_time = "2025-01-27T19:37:01.065Z" }, +] + +[[package]] +name = "opt-einsum" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/b9/2ac072041e899a52f20cf9510850ff58295003aa75525e58343591b0cbfb/opt_einsum-3.4.0.tar.gz", hash = "sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac", size = 63004, upload_time = "2024-09-26T14:33:24.483Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/23/cd/066e86230ae37ed0be70aae89aabf03ca8d9f39c8aea0dec8029455b5540/opt_einsum-3.4.0-py3-none-any.whl", hash = "sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd", size = 71932, upload_time = "2024-09-26T14:33:23.039Z" }, +] + +[[package]] +name = "packaging" +version = "25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload_time = "2025-04-19T11:48:59.673Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload_time = "2025-04-19T11:48:57.875Z" }, +] + +[[package]] +name = "pandas" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213, upload_time = "2024-09-20T13:10:04.827Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222, upload_time = "2024-09-20T13:08:56.254Z" }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274, upload_time = "2024-09-20T13:08:58.645Z" }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836, upload_time = "2024-09-20T19:01:57.571Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505, upload_time = "2024-09-20T13:09:01.501Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420, upload_time = "2024-09-20T19:02:00.678Z" }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457, upload_time = "2024-09-20T13:09:04.105Z" }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166, upload_time = "2024-09-20T13:09:06.917Z" }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893, upload_time = "2024-09-20T13:09:09.655Z" }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475, upload_time = "2024-09-20T13:09:14.718Z" }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645, upload_time = "2024-09-20T19:02:03.88Z" }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445, upload_time = "2024-09-20T13:09:17.621Z" }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235, upload_time = "2024-09-20T19:02:07.094Z" }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756, upload_time = "2024-09-20T13:09:20.474Z" }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248, upload_time = "2024-09-20T13:09:23.137Z" }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643, upload_time = "2024-09-20T13:09:25.522Z" }, + { url = 
"https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573, upload_time = "2024-09-20T13:09:28.012Z" }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085, upload_time = "2024-09-20T19:02:10.451Z" }, + { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809, upload_time = "2024-09-20T13:09:30.814Z" }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316, upload_time = "2024-09-20T19:02:13.825Z" }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055, upload_time = "2024-09-20T13:09:33.462Z" }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175, upload_time = "2024-09-20T13:09:35.871Z" }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650, upload_time = "2024-09-20T13:09:38.685Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177, upload_time = "2024-09-20T13:09:41.141Z" }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526, upload_time = "2024-09-20T19:02:16.905Z" }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013, upload_time = "2024-09-20T13:09:44.39Z" }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620, upload_time = "2024-09-20T19:02:20.639Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload_time = "2024-09-20T13:09:48.112Z" }, +] + +[[package]] +name = "playwright" +version = "1.49.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet" }, + { name = "pyee" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/be/01025581052e43eb698092c4328d7497ca62bcb5c83f15a611d4a71b4b92/playwright-1.49.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:1041ffb45a0d0bc44d698d3a5aa3ac4b67c9bd03540da43a0b70616ad52592b8", size = 39559859, upload_time = "2024-12-10T17:32:14.907Z" }, + { url = "https://files.pythonhosted.org/packages/79/25/ef1010a42cc7d576282015d983c5451d73e369b198b6eb32a177fae281f8/playwright-1.49.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9f38ed3d0c1f4e0a6d1c92e73dd9a61f8855133249d6f0cec28648d38a7137be", size = 38808973, upload_time = "2024-12-10T17:32:22.516Z" }, + { url = "https://files.pythonhosted.org/packages/70/4b/3930cf10f303a10d493a382e4448aaff898b4065698b3b8d92f902e53e08/playwright-1.49.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:3be48c6d26dc819ca0a26567c1ae36a980a0303dcd4249feb6f59e115aaddfb8", size = 39559863, upload_time = "2024-12-10T17:32:29.12Z" }, + { url = "https://files.pythonhosted.org/packages/9a/c1/ea765e72a746dc7ec2ce155ffea29d454e7171db78f3c09185e888387246/playwright-1.49.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:753ca90ee31b4b03d165cfd36e477309ebf2b4381953f2a982ff612d85b147d2", size = 44163300, upload_time = "2024-12-10T17:32:35.647Z" }, + { url = "https://files.pythonhosted.org/packages/5a/52/95efac704bf36b770a2522d88a6dee298042845d10bfb35f7ca0fcc36d91/playwright-1.49.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd9bc8dab37aa25198a01f555f0a2e2c3813fe200fef018ac34dfe86b34994b9", size = 43744353, upload_time = "2024-12-10T17:32:43.189Z" }, + { url = "https://files.pythonhosted.org/packages/f9/97/a3fccc9aaa6da83890772e9980703b0ea6b1e1ad42042fb50df3aef6c641/playwright-1.49.1-py3-none-win32.whl", hash = "sha256:43b304be67f096058e587dac453ece550eff87b8fbed28de30f4f022cc1745bb", size = 34060663, upload_time = "2024-12-10T17:32:49.904Z" }, + { url = "https://files.pythonhosted.org/packages/71/a9/bd88ac0bd498c91aab3aba2e393d1fa59f72a7243e9265ccbf4861ca4f64/playwright-1.49.1-py3-none-win_amd64.whl", hash = "sha256:47b23cb346283278f5b4d1e1990bcb6d6302f80c0aa0ca93dd0601a1400191df", size = 34060667, upload_time = "2024-12-10T17:32:56.459Z" }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955, upload_time = "2024-04-20T21:34:42.531Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload_time = "2024-04-20T21:34:40.434Z" }, +] + +[[package]] +name = "propcache" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/07/c8/fdc6686a986feae3541ea23dcaa661bd93972d3940460646c6bb96e21c40/propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf", size = 43651, upload_time = "2025-03-26T03:06:12.05Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/0f/5a5319ee83bd651f75311fcb0c492c21322a7fc8f788e4eef23f44243427/propcache-0.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f30241577d2fef2602113b70ef7231bf4c69a97e04693bde08ddab913ba0ce5", size = 80243, upload_time = "2025-03-26T03:04:01.912Z" }, + { url = "https://files.pythonhosted.org/packages/ce/84/3db5537e0879942783e2256616ff15d870a11d7ac26541336fe1b673c818/propcache-0.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:43593c6772aa12abc3af7784bff4a41ffa921608dd38b77cf1dfd7f5c4e71371", size = 46503, upload_time = "2025-03-26T03:04:03.704Z" }, + { url = "https://files.pythonhosted.org/packages/e2/c8/b649ed972433c3f0d827d7f0cf9ea47162f4ef8f4fe98c5f3641a0bc63ff/propcache-0.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a75801768bbe65499495660b777e018cbe90c7980f07f8aa57d6be79ea6f71da", size = 45934, upload_time = "2025-03-26T03:04:05.257Z" }, + { url = "https://files.pythonhosted.org/packages/59/f9/4c0a5cf6974c2c43b1a6810c40d889769cc8f84cea676cbe1e62766a45f8/propcache-0.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6f1324db48f001c2ca26a25fa25af60711e09b9aaf4b28488602776f4f9a744", size = 233633, upload_time = "2025-03-26T03:04:07.044Z" }, + { url = "https://files.pythonhosted.org/packages/e7/64/66f2f4d1b4f0007c6e9078bd95b609b633d3957fe6dd23eac33ebde4b584/propcache-0.3.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cdb0f3e1eb6dfc9965d19734d8f9c481b294b5274337a8cb5cb01b462dcb7e0", size = 241124, upload_time = "2025-03-26T03:04:08.676Z" }, + { url = "https://files.pythonhosted.org/packages/aa/bf/7b8c9fd097d511638fa9b6af3d986adbdf567598a567b46338c925144c1b/propcache-0.3.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1eb34d90aac9bfbced9a58b266f8946cb5935869ff01b164573a7634d39fbcb5", size = 240283, upload_time = "2025-03-26T03:04:10.172Z" }, + { url = "https://files.pythonhosted.org/packages/fa/c9/e85aeeeaae83358e2a1ef32d6ff50a483a5d5248bc38510d030a6f4e2816/propcache-0.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f35c7070eeec2cdaac6fd3fe245226ed2a6292d3ee8c938e5bb645b434c5f256", size = 232498, upload_time = "2025-03-26T03:04:11.616Z" }, + { url = "https://files.pythonhosted.org/packages/8e/66/acb88e1f30ef5536d785c283af2e62931cb934a56a3ecf39105887aa8905/propcache-0.3.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b23c11c2c9e6d4e7300c92e022046ad09b91fd00e36e83c44483df4afa990073", size = 221486, upload_time = "2025-03-26T03:04:13.102Z" }, + { url = "https://files.pythonhosted.org/packages/f5/f9/233ddb05ffdcaee4448508ee1d70aa7deff21bb41469ccdfcc339f871427/propcache-0.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3e19ea4ea0bf46179f8a3652ac1426e6dcbaf577ce4b4f65be581e237340420d", size = 222675, upload_time = "2025-03-26T03:04:14.658Z" }, + { url = "https://files.pythonhosted.org/packages/98/b8/eb977e28138f9e22a5a789daf608d36e05ed93093ef12a12441030da800a/propcache-0.3.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:bd39c92e4c8f6cbf5f08257d6360123af72af9f4da75a690bef50da77362d25f", size = 215727, upload_time = 
"2025-03-26T03:04:16.207Z" }, + { url = "https://files.pythonhosted.org/packages/89/2d/5f52d9c579f67b8ee1edd9ec073c91b23cc5b7ff7951a1e449e04ed8fdf3/propcache-0.3.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b0313e8b923b3814d1c4a524c93dfecea5f39fa95601f6a9b1ac96cd66f89ea0", size = 217878, upload_time = "2025-03-26T03:04:18.11Z" }, + { url = "https://files.pythonhosted.org/packages/7a/fd/5283e5ed8a82b00c7a989b99bb6ea173db1ad750bf0bf8dff08d3f4a4e28/propcache-0.3.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e861ad82892408487be144906a368ddbe2dc6297074ade2d892341b35c59844a", size = 230558, upload_time = "2025-03-26T03:04:19.562Z" }, + { url = "https://files.pythonhosted.org/packages/90/38/ab17d75938ef7ac87332c588857422ae126b1c76253f0f5b1242032923ca/propcache-0.3.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:61014615c1274df8da5991a1e5da85a3ccb00c2d4701ac6f3383afd3ca47ab0a", size = 233754, upload_time = "2025-03-26T03:04:21.065Z" }, + { url = "https://files.pythonhosted.org/packages/06/5d/3b921b9c60659ae464137508d3b4c2b3f52f592ceb1964aa2533b32fcf0b/propcache-0.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:71ebe3fe42656a2328ab08933d420df5f3ab121772eef78f2dc63624157f0ed9", size = 226088, upload_time = "2025-03-26T03:04:22.718Z" }, + { url = "https://files.pythonhosted.org/packages/54/6e/30a11f4417d9266b5a464ac5a8c5164ddc9dd153dfa77bf57918165eb4ae/propcache-0.3.1-cp311-cp311-win32.whl", hash = "sha256:58aa11f4ca8b60113d4b8e32d37e7e78bd8af4d1a5b5cb4979ed856a45e62005", size = 40859, upload_time = "2025-03-26T03:04:24.039Z" }, + { url = "https://files.pythonhosted.org/packages/1d/3a/8a68dd867da9ca2ee9dfd361093e9cb08cb0f37e5ddb2276f1b5177d7731/propcache-0.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:9532ea0b26a401264b1365146c440a6d78269ed41f83f23818d4b79497aeabe7", size = 45153, upload_time = "2025-03-26T03:04:25.211Z" }, + { url = "https://files.pythonhosted.org/packages/41/aa/ca78d9be314d1e15ff517b992bebbed3bdfef5b8919e85bf4940e57b6137/propcache-0.3.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f78eb8422acc93d7b69964012ad7048764bb45a54ba7a39bb9e146c72ea29723", size = 80430, upload_time = "2025-03-26T03:04:26.436Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d8/f0c17c44d1cda0ad1979af2e593ea290defdde9eaeb89b08abbe02a5e8e1/propcache-0.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:89498dd49c2f9a026ee057965cdf8192e5ae070ce7d7a7bd4b66a8e257d0c976", size = 46637, upload_time = "2025-03-26T03:04:27.932Z" }, + { url = "https://files.pythonhosted.org/packages/ae/bd/c1e37265910752e6e5e8a4c1605d0129e5b7933c3dc3cf1b9b48ed83b364/propcache-0.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:09400e98545c998d57d10035ff623266927cb784d13dd2b31fd33b8a5316b85b", size = 46123, upload_time = "2025-03-26T03:04:30.659Z" }, + { url = "https://files.pythonhosted.org/packages/d4/b0/911eda0865f90c0c7e9f0415d40a5bf681204da5fd7ca089361a64c16b28/propcache-0.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8efd8c5adc5a2c9d3b952815ff8f7710cefdcaf5f2c36d26aff51aeca2f12f", size = 243031, upload_time = "2025-03-26T03:04:31.977Z" }, + { url = "https://files.pythonhosted.org/packages/0a/06/0da53397c76a74271621807265b6eb61fb011451b1ddebf43213df763669/propcache-0.3.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2fe5c910f6007e716a06d269608d307b4f36e7babee5f36533722660e8c4a70", size = 249100, upload_time = "2025-03-26T03:04:33.45Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/eb/13090e05bf6b963fc1653cdc922133ced467cb4b8dab53158db5a37aa21e/propcache-0.3.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a0ab8cf8cdd2194f8ff979a43ab43049b1df0b37aa64ab7eca04ac14429baeb7", size = 250170, upload_time = "2025-03-26T03:04:35.542Z" }, + { url = "https://files.pythonhosted.org/packages/3b/4c/f72c9e1022b3b043ec7dc475a0f405d4c3e10b9b1d378a7330fecf0652da/propcache-0.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:563f9d8c03ad645597b8d010ef4e9eab359faeb11a0a2ac9f7b4bc8c28ebef25", size = 245000, upload_time = "2025-03-26T03:04:37.501Z" }, + { url = "https://files.pythonhosted.org/packages/e8/fd/970ca0e22acc829f1adf5de3724085e778c1ad8a75bec010049502cb3a86/propcache-0.3.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb6e0faf8cb6b4beea5d6ed7b5a578254c6d7df54c36ccd3d8b3eb00d6770277", size = 230262, upload_time = "2025-03-26T03:04:39.532Z" }, + { url = "https://files.pythonhosted.org/packages/c4/42/817289120c6b9194a44f6c3e6b2c3277c5b70bbad39e7df648f177cc3634/propcache-0.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1c5c7ab7f2bb3f573d1cb921993006ba2d39e8621019dffb1c5bc94cdbae81e8", size = 236772, upload_time = "2025-03-26T03:04:41.109Z" }, + { url = "https://files.pythonhosted.org/packages/7c/9c/3b3942b302badd589ad6b672da3ca7b660a6c2f505cafd058133ddc73918/propcache-0.3.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:050b571b2e96ec942898f8eb46ea4bfbb19bd5502424747e83badc2d4a99a44e", size = 231133, upload_time = "2025-03-26T03:04:42.544Z" }, + { url = "https://files.pythonhosted.org/packages/98/a1/75f6355f9ad039108ff000dfc2e19962c8dea0430da9a1428e7975cf24b2/propcache-0.3.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e1c4d24b804b3a87e9350f79e2371a705a188d292fd310e663483af6ee6718ee", size = 230741, upload_time = "2025-03-26T03:04:44.06Z" }, + { url = "https://files.pythonhosted.org/packages/67/0c/3e82563af77d1f8731132166da69fdfd95e71210e31f18edce08a1eb11ea/propcache-0.3.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e4fe2a6d5ce975c117a6bb1e8ccda772d1e7029c1cca1acd209f91d30fa72815", size = 244047, upload_time = "2025-03-26T03:04:45.983Z" }, + { url = "https://files.pythonhosted.org/packages/f7/50/9fb7cca01532a08c4d5186d7bb2da6c4c587825c0ae134b89b47c7d62628/propcache-0.3.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:feccd282de1f6322f56f6845bf1207a537227812f0a9bf5571df52bb418d79d5", size = 246467, upload_time = "2025-03-26T03:04:47.699Z" }, + { url = "https://files.pythonhosted.org/packages/a9/02/ccbcf3e1c604c16cc525309161d57412c23cf2351523aedbb280eb7c9094/propcache-0.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ec314cde7314d2dd0510c6787326bbffcbdc317ecee6b7401ce218b3099075a7", size = 241022, upload_time = "2025-03-26T03:04:49.195Z" }, + { url = "https://files.pythonhosted.org/packages/db/19/e777227545e09ca1e77a6e21274ae9ec45de0f589f0ce3eca2a41f366220/propcache-0.3.1-cp312-cp312-win32.whl", hash = "sha256:7d2d5a0028d920738372630870e7d9644ce437142197f8c827194fca404bf03b", size = 40647, upload_time = "2025-03-26T03:04:50.595Z" }, + { url = "https://files.pythonhosted.org/packages/24/bb/3b1b01da5dd04c77a204c84e538ff11f624e31431cfde7201d9110b092b1/propcache-0.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:88c423efef9d7a59dae0614eaed718449c09a5ac79a5f224a8b9664d603f04a3", size = 44784, upload_time = "2025-03-26T03:04:51.791Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/60/f645cc8b570f99be3cf46714170c2de4b4c9d6b827b912811eff1eb8a412/propcache-0.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f1528ec4374617a7a753f90f20e2f551121bb558fcb35926f99e3c42367164b8", size = 77865, upload_time = "2025-03-26T03:04:53.406Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d4/c1adbf3901537582e65cf90fd9c26fde1298fde5a2c593f987112c0d0798/propcache-0.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc1915ec523b3b494933b5424980831b636fe483d7d543f7afb7b3bf00f0c10f", size = 45452, upload_time = "2025-03-26T03:04:54.624Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b5/fe752b2e63f49f727c6c1c224175d21b7d1727ce1d4873ef1c24c9216830/propcache-0.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a110205022d077da24e60b3df8bcee73971be9575dec5573dd17ae5d81751111", size = 44800, upload_time = "2025-03-26T03:04:55.844Z" }, + { url = "https://files.pythonhosted.org/packages/62/37/fc357e345bc1971e21f76597028b059c3d795c5ca7690d7a8d9a03c9708a/propcache-0.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d249609e547c04d190e820d0d4c8ca03ed4582bcf8e4e160a6969ddfb57b62e5", size = 225804, upload_time = "2025-03-26T03:04:57.158Z" }, + { url = "https://files.pythonhosted.org/packages/0d/f1/16e12c33e3dbe7f8b737809bad05719cff1dccb8df4dafbcff5575002c0e/propcache-0.3.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ced33d827625d0a589e831126ccb4f5c29dfdf6766cac441d23995a65825dcb", size = 230650, upload_time = "2025-03-26T03:04:58.61Z" }, + { url = "https://files.pythonhosted.org/packages/3e/a2/018b9f2ed876bf5091e60153f727e8f9073d97573f790ff7cdf6bc1d1fb8/propcache-0.3.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4114c4ada8f3181af20808bedb250da6bae56660e4b8dfd9cd95d4549c0962f7", size = 234235, upload_time = "2025-03-26T03:05:00.599Z" }, + { url = "https://files.pythonhosted.org/packages/45/5f/3faee66fc930dfb5da509e34c6ac7128870631c0e3582987fad161fcb4b1/propcache-0.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:975af16f406ce48f1333ec5e912fe11064605d5c5b3f6746969077cc3adeb120", size = 228249, upload_time = "2025-03-26T03:05:02.11Z" }, + { url = "https://files.pythonhosted.org/packages/62/1e/a0d5ebda5da7ff34d2f5259a3e171a94be83c41eb1e7cd21a2105a84a02e/propcache-0.3.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a34aa3a1abc50740be6ac0ab9d594e274f59960d3ad253cd318af76b996dd654", size = 214964, upload_time = "2025-03-26T03:05:03.599Z" }, + { url = "https://files.pythonhosted.org/packages/db/a0/d72da3f61ceab126e9be1f3bc7844b4e98c6e61c985097474668e7e52152/propcache-0.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9cec3239c85ed15bfaded997773fdad9fb5662b0a7cbc854a43f291eb183179e", size = 222501, upload_time = "2025-03-26T03:05:05.107Z" }, + { url = "https://files.pythonhosted.org/packages/18/6d/a008e07ad7b905011253adbbd97e5b5375c33f0b961355ca0a30377504ac/propcache-0.3.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:05543250deac8e61084234d5fc54f8ebd254e8f2b39a16b1dce48904f45b744b", size = 217917, upload_time = "2025-03-26T03:05:06.59Z" }, + { url = "https://files.pythonhosted.org/packages/98/37/02c9343ffe59e590e0e56dc5c97d0da2b8b19fa747ebacf158310f97a79a/propcache-0.3.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5cb5918253912e088edbf023788de539219718d3b10aef334476b62d2b53de53", size = 217089, upload_time = 
"2025-03-26T03:05:08.1Z" }, + { url = "https://files.pythonhosted.org/packages/53/1b/d3406629a2c8a5666d4674c50f757a77be119b113eedd47b0375afdf1b42/propcache-0.3.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f3bbecd2f34d0e6d3c543fdb3b15d6b60dd69970c2b4c822379e5ec8f6f621d5", size = 228102, upload_time = "2025-03-26T03:05:09.982Z" }, + { url = "https://files.pythonhosted.org/packages/cd/a7/3664756cf50ce739e5f3abd48febc0be1a713b1f389a502ca819791a6b69/propcache-0.3.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aca63103895c7d960a5b9b044a83f544b233c95e0dcff114389d64d762017af7", size = 230122, upload_time = "2025-03-26T03:05:11.408Z" }, + { url = "https://files.pythonhosted.org/packages/35/36/0bbabaacdcc26dac4f8139625e930f4311864251276033a52fd52ff2a274/propcache-0.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a0a9898fdb99bf11786265468571e628ba60af80dc3f6eb89a3545540c6b0ef", size = 226818, upload_time = "2025-03-26T03:05:12.909Z" }, + { url = "https://files.pythonhosted.org/packages/cc/27/4e0ef21084b53bd35d4dae1634b6d0bad35e9c58ed4f032511acca9d4d26/propcache-0.3.1-cp313-cp313-win32.whl", hash = "sha256:3a02a28095b5e63128bcae98eb59025924f121f048a62393db682f049bf4ac24", size = 40112, upload_time = "2025-03-26T03:05:14.289Z" }, + { url = "https://files.pythonhosted.org/packages/a6/2c/a54614d61895ba6dd7ac8f107e2b2a0347259ab29cbf2ecc7b94fa38c4dc/propcache-0.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:813fbb8b6aea2fc9659815e585e548fe706d6f663fa73dff59a1677d4595a037", size = 44034, upload_time = "2025-03-26T03:05:15.616Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a8/0a4fd2f664fc6acc66438370905124ce62e84e2e860f2557015ee4a61c7e/propcache-0.3.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a444192f20f5ce8a5e52761a031b90f5ea6288b1eef42ad4c7e64fef33540b8f", size = 82613, upload_time = "2025-03-26T03:05:16.913Z" }, + { url = "https://files.pythonhosted.org/packages/4d/e5/5ef30eb2cd81576256d7b6caaa0ce33cd1d2c2c92c8903cccb1af1a4ff2f/propcache-0.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fbe94666e62ebe36cd652f5fc012abfbc2342de99b523f8267a678e4dfdee3c", size = 47763, upload_time = "2025-03-26T03:05:18.607Z" }, + { url = "https://files.pythonhosted.org/packages/87/9a/87091ceb048efeba4d28e903c0b15bcc84b7c0bf27dc0261e62335d9b7b8/propcache-0.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f011f104db880f4e2166bcdcf7f58250f7a465bc6b068dc84c824a3d4a5c94dc", size = 47175, upload_time = "2025-03-26T03:05:19.85Z" }, + { url = "https://files.pythonhosted.org/packages/3e/2f/854e653c96ad1161f96194c6678a41bbb38c7947d17768e8811a77635a08/propcache-0.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e584b6d388aeb0001d6d5c2bd86b26304adde6d9bb9bfa9c4889805021b96de", size = 292265, upload_time = "2025-03-26T03:05:21.654Z" }, + { url = "https://files.pythonhosted.org/packages/40/8d/090955e13ed06bc3496ba4a9fb26c62e209ac41973cb0d6222de20c6868f/propcache-0.3.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a17583515a04358b034e241f952f1715243482fc2c2945fd99a1b03a0bd77d6", size = 294412, upload_time = "2025-03-26T03:05:23.147Z" }, + { url = "https://files.pythonhosted.org/packages/39/e6/d51601342e53cc7582449e6a3c14a0479fab2f0750c1f4d22302e34219c6/propcache-0.3.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5aed8d8308215089c0734a2af4f2e95eeb360660184ad3912686c181e500b2e7", size = 294290, upload_time = "2025-03-26T03:05:24.577Z" }, + { url = 
"https://files.pythonhosted.org/packages/3b/4d/be5f1a90abc1881884aa5878989a1acdafd379a91d9c7e5e12cef37ec0d7/propcache-0.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d8e309ff9a0503ef70dc9a0ebd3e69cf7b3894c9ae2ae81fc10943c37762458", size = 282926, upload_time = "2025-03-26T03:05:26.459Z" }, + { url = "https://files.pythonhosted.org/packages/57/2b/8f61b998c7ea93a2b7eca79e53f3e903db1787fca9373af9e2cf8dc22f9d/propcache-0.3.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b655032b202028a582d27aeedc2e813299f82cb232f969f87a4fde491a233f11", size = 267808, upload_time = "2025-03-26T03:05:28.188Z" }, + { url = "https://files.pythonhosted.org/packages/11/1c/311326c3dfce59c58a6098388ba984b0e5fb0381ef2279ec458ef99bd547/propcache-0.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f64d91b751df77931336b5ff7bafbe8845c5770b06630e27acd5dbb71e1931c", size = 290916, upload_time = "2025-03-26T03:05:29.757Z" }, + { url = "https://files.pythonhosted.org/packages/4b/74/91939924b0385e54dc48eb2e4edd1e4903ffd053cf1916ebc5347ac227f7/propcache-0.3.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:19a06db789a4bd896ee91ebc50d059e23b3639c25d58eb35be3ca1cbe967c3bf", size = 262661, upload_time = "2025-03-26T03:05:31.472Z" }, + { url = "https://files.pythonhosted.org/packages/c2/d7/e6079af45136ad325c5337f5dd9ef97ab5dc349e0ff362fe5c5db95e2454/propcache-0.3.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:bef100c88d8692864651b5f98e871fb090bd65c8a41a1cb0ff2322db39c96c27", size = 264384, upload_time = "2025-03-26T03:05:32.984Z" }, + { url = "https://files.pythonhosted.org/packages/b7/d5/ba91702207ac61ae6f1c2da81c5d0d6bf6ce89e08a2b4d44e411c0bbe867/propcache-0.3.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:87380fb1f3089d2a0b8b00f006ed12bd41bd858fabfa7330c954c70f50ed8757", size = 291420, upload_time = "2025-03-26T03:05:34.496Z" }, + { url = "https://files.pythonhosted.org/packages/58/70/2117780ed7edcd7ba6b8134cb7802aada90b894a9810ec56b7bb6018bee7/propcache-0.3.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e474fc718e73ba5ec5180358aa07f6aded0ff5f2abe700e3115c37d75c947e18", size = 290880, upload_time = "2025-03-26T03:05:36.256Z" }, + { url = "https://files.pythonhosted.org/packages/4a/1f/ecd9ce27710021ae623631c0146719280a929d895a095f6d85efb6a0be2e/propcache-0.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:17d1c688a443355234f3c031349da69444be052613483f3e4158eef751abcd8a", size = 287407, upload_time = "2025-03-26T03:05:37.799Z" }, + { url = "https://files.pythonhosted.org/packages/3e/66/2e90547d6b60180fb29e23dc87bd8c116517d4255240ec6d3f7dc23d1926/propcache-0.3.1-cp313-cp313t-win32.whl", hash = "sha256:359e81a949a7619802eb601d66d37072b79b79c2505e6d3fd8b945538411400d", size = 42573, upload_time = "2025-03-26T03:05:39.193Z" }, + { url = "https://files.pythonhosted.org/packages/cb/8f/50ad8599399d1861b4d2b6b45271f0ef6af1b09b0a2386a46dbaf19c9535/propcache-0.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e7fb9a84c9abbf2b2683fa3e7b0d7da4d8ecf139a1c635732a8bda29c5214b0e", size = 46757, upload_time = "2025-03-26T03:05:40.811Z" }, + { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376, upload_time = "2025-03-26T03:06:10.5Z" }, +] + +[[package]] +name = "proto-plus" +version = "1.26.1" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload_time = "2025-03-10T15:54:38.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload_time = "2025-03-10T15:54:37.335Z" }, +] + +[[package]] +name = "protobuf" +version = "4.25.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/74/63/84fdeac1f03864c2b8b9f0b7fe711c4af5f95759ee281d2026530086b2f5/protobuf-4.25.7.tar.gz", hash = "sha256:28f65ae8c14523cc2c76c1e91680958700d3eac69f45c96512c12c63d9a38807", size = 380612, upload_time = "2025-04-24T02:56:58.685Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/ed/9a58076cfb8edc237c92617f1d3744660e9b4457d54f3c2fdf1a4bbae5c7/protobuf-4.25.7-cp310-abi3-win32.whl", hash = "sha256:dc582cf1a73a6b40aa8e7704389b8d8352da616bc8ed5c6cc614bdd0b5ce3f7a", size = 392457, upload_time = "2025-04-24T02:56:40.798Z" }, + { url = "https://files.pythonhosted.org/packages/28/b3/e00870528029fe252cf3bd6fa535821c276db3753b44a4691aee0d52ff9e/protobuf-4.25.7-cp310-abi3-win_amd64.whl", hash = "sha256:cd873dbddb28460d1706ff4da2e7fac175f62f2a0bebc7b33141f7523c5a2399", size = 413446, upload_time = "2025-04-24T02:56:44.199Z" }, + { url = "https://files.pythonhosted.org/packages/60/1d/f450a193f875a20099d4492d2c1cb23091d65d512956fb1e167ee61b4bf0/protobuf-4.25.7-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:4c899f09b0502eb39174c717ccf005b844ea93e31137c167ddcacf3e09e49610", size = 394248, upload_time = "2025-04-24T02:56:45.75Z" }, + { url = "https://files.pythonhosted.org/packages/c8/b8/ea88e9857484a0618c74121618b9e620fc50042de43cdabbebe1b93a83e0/protobuf-4.25.7-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:6d2f5dede3d112e573f0e5f9778c0c19d9f9e209727abecae1d39db789f522c6", size = 293717, upload_time = "2025-04-24T02:56:47.427Z" }, + { url = "https://files.pythonhosted.org/packages/a7/81/d0b68e9a9a76804113b6dedc6fffed868b97048bbe6f1bedc675bdb8523c/protobuf-4.25.7-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:d41fb7ae72a25fcb79b2d71e4247f0547a02e8185ed51587c22827a87e5736ed", size = 294636, upload_time = "2025-04-24T02:56:48.976Z" }, + { url = "https://files.pythonhosted.org/packages/17/d7/1e7c80cb2ea2880cfe38580dcfbb22b78b746640c9c13fc3337a6967dc4c/protobuf-4.25.7-py3-none-any.whl", hash = "sha256:e9d969f5154eaeab41404def5dcf04e62162178f4b9de98b2d3c1c70f5f84810", size = 156468, upload_time = "2025-04-24T02:56:56.957Z" }, +] + +[[package]] +name = "psycopg" +version = "3.1.20" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5c/6d/0939210f3ba089b360cf0d3741494719152567bc81303cca2c0f1e67c78a/psycopg-3.1.20.tar.gz", hash = "sha256:32f5862ab79f238496236f97fe374a7ab55b4b4bb839a74802026544735f9a07", size = 147567, upload_time = "2024-06-30T17:03:55.421Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/2d/e9/126bbfd5dded758bb109526c5f5f2c2538fe293b15b6fa208db7078c72c4/psycopg-3.1.20-py3-none-any.whl", hash = "sha256:898a29f49ac9c903d554f5a6cdc44a8fc564325557c18f82e51f39c1f4fc2aeb", size = 179473, upload_time = "2024-06-30T16:57:04.093Z" }, +] + +[package.optional-dependencies] +binary = [ + { name = "psycopg-binary", marker = "implementation_name != 'pypy'" }, +] + +[[package]] +name = "psycopg-binary" +version = "3.1.20" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/1c/45e5f240765e80076b08c3ed02c5dfeb5e97d549769b81f8382485d70a15/psycopg_binary-3.1.20-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:802989350fcbc783732bfef660afb34439a62727642a05e8bb9acf7d68993627", size = 3350503, upload_time = "2024-06-30T16:58:27.18Z" }, + { url = "https://files.pythonhosted.org/packages/52/b8/acf96d388692d0bbf2346286f8b175778bc24046aca9181f50d9df9f4714/psycopg_binary-3.1.20-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:01b0e39128715fc37fed6cdc50ab58278eacb75709af503eb607654030975f09", size = 3480091, upload_time = "2024-06-30T16:58:33.872Z" }, + { url = "https://files.pythonhosted.org/packages/41/d4/20604282ff08823d0e90cf092738ea21b339f56a172d8583565b272fc4be/psycopg_binary-3.1.20-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77af1086bedfa0729465565c636de3519079ba523d7b7ee6e8b9486beb1ee905", size = 4434555, upload_time = "2024-06-30T16:58:40.795Z" }, + { url = "https://files.pythonhosted.org/packages/73/e0/3917b766508bb749e08225492d45ba7463b559de1c8a41d3f8f3cf0927cb/psycopg_binary-3.1.20-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9b9562395d441e225f354e8c6303ee6993a93aaeb0dbb5b94368f3249ab2388", size = 4231402, upload_time = "2024-06-30T16:58:48.586Z" }, + { url = "https://files.pythonhosted.org/packages/b4/9b/251435896f7459beda355ef3e3919b6b20d067582cd6838ba248d3cff188/psycopg_binary-3.1.20-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e814d69e5447a93e7b98117ec95a8ce606d3742092fd120960551ed67c376fea", size = 4484218, upload_time = "2024-06-30T16:58:56.911Z" }, + { url = "https://files.pythonhosted.org/packages/a1/12/b2057f9bb8b5f408139266a5b48bfd7578340296d7314d964b9f09e5b18f/psycopg_binary-3.1.20-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adf1c2061600235ae9b11d7ad357cab89ac583a76bdb0199f7a29ac947939c20", size = 4176668, upload_time = "2024-06-30T16:59:02.496Z" }, + { url = "https://files.pythonhosted.org/packages/80/9c/a62fe4167427a06e69882d274ba90903507afc89caf6bcc3671790a20875/psycopg_binary-3.1.20-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:50f1d807b4167f973a6f67bca39bf656b737f7426be158a1dc9cb0000d020744", size = 3102502, upload_time = "2024-06-30T16:59:07.216Z" }, + { url = "https://files.pythonhosted.org/packages/98/83/bceca23dd830d4069949e70dec9feb03c114cc551b104f0e2b48b1e598c6/psycopg_binary-3.1.20-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4cf6ec1490232a5b208dae94a8269dc739e6762684c8658a0f3570402db934ae", size = 3080005, upload_time = "2024-06-30T16:59:14.927Z" }, + { url = "https://files.pythonhosted.org/packages/fc/83/bab7c8495e0eb11bf710663afb2849c2d3c91a2bf61b2bd597941f57f80b/psycopg_binary-3.1.20-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:309c09ec50a9c5c8492c2922ee666df1e30a08b08a9b63083d0daa414eccd09c", size = 3182315, upload_time = "2024-06-30T16:59:21.18Z" }, + { url = 
"https://files.pythonhosted.org/packages/ca/9b/bd4970faed24ae4a850ee8c6ebd621e98fd86e2962e13038603a726e2504/psycopg_binary-3.1.20-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e2c33a01799f93ef8c11a023df66280e39ca3c3249a2581adb2a0e5e80801088", size = 3222552, upload_time = "2024-06-30T16:59:27.663Z" }, + { url = "https://files.pythonhosted.org/packages/5d/0b/7ab0744f282df53968f5066d5fd8bf3f994f90bf2a8003ab40278818d0f2/psycopg_binary-3.1.20-cp311-cp311-win_amd64.whl", hash = "sha256:2c67532057fda72579b02d9d61e9cc8975982844bd5c3c9dc7f84ce8bcac859c", size = 2899115, upload_time = "2024-06-30T16:59:35.512Z" }, + { url = "https://files.pythonhosted.org/packages/94/12/6e909d3a20f7bfa6915c1fdf64ab47bb9ca44b837adb468841aad51bab6c/psycopg_binary-3.1.20-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:ef08de60f1b8503a6f6b6f5bee612de36373c09bc0e3f84409fab09e1ff72107", size = 3326944, upload_time = "2024-06-30T16:59:41.783Z" }, + { url = "https://files.pythonhosted.org/packages/e1/4e/dc425f5c8c102045486f2fa39c3cb379b073557d6bd2cf5d06de81036d7c/psycopg_binary-3.1.20-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a4847fa31c8d3a6dd3536cf1e130dfcc454ed26be471ef274e4358bf7f709cda", size = 3475444, upload_time = "2024-06-30T16:59:48.547Z" }, + { url = "https://files.pythonhosted.org/packages/cd/cd/6484cbdb82dc29bfe43ae8c401a0be309402c304d1aaabcccf1e21908663/psycopg_binary-3.1.20-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b72e9c8c79dcc30e34e996079cfe0374b7c7233d2b5f6f25a0bc8872fe2babef", size = 4412872, upload_time = "2024-06-30T16:59:54.853Z" }, + { url = "https://files.pythonhosted.org/packages/25/d3/d403dc61f9d8b56683a6a1db47ab156807d2e1c442b044fba5763e786893/psycopg_binary-3.1.20-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:836246f3c486ef7edfce6cf6cc760173e244826ebecd54c1b63c91d4cc0341f7", size = 4216654, upload_time = "2024-06-30T16:59:58.935Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ff/389198638ad10ec0e80fcc97b5c8092987214d9ac529b1224bf0f7e221da/psycopg_binary-3.1.20-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:015f70b17539ec0ecfb0f87bcaface0c7fa1289b6e7e2313dc7cdfdc513e3235", size = 4451310, upload_time = "2024-06-30T17:00:05.647Z" }, + { url = "https://files.pythonhosted.org/packages/84/94/9ae70af00caf9ce98f857a883ff64c5d236dfea5b7b4b8528d28e80515aa/psycopg_binary-3.1.20-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f52498dc7b41fee74e971823ede4519e3a9597d416f7a2044dbe4b98cc61ff35", size = 4153667, upload_time = "2024-06-30T17:00:12.309Z" }, + { url = "https://files.pythonhosted.org/packages/b8/57/b8a34174803683ef0f3f2fe18304f7048d31bab431f21cf511598b894ed7/psycopg_binary-3.1.20-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:92b61bae0ac881580faa1c89bf2167db7041cb01cc0bd686244f9c20a010036a", size = 3081906, upload_time = "2024-06-30T17:00:17.223Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e7/5df8c4794f13004787cd7ddfe456eec90f49d1b99f1a10947f7ba2a67487/psycopg_binary-3.1.20-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3532b8677666aadb64a4e31f6e97fe4ab71b862ab100d337faf497198339fd4d", size = 3061376, upload_time = "2024-06-30T17:00:22.232Z" }, + { url = "https://files.pythonhosted.org/packages/8e/c6/ec4abb814f54af4b659896ce10386be0c538dad8111b3daeaf672b4daa03/psycopg_binary-3.1.20-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f7df27f50a7db84c28e58be3df41f39618161096c3379ad68bc665a454c53e93", size = 3150174, 
upload_time = "2024-06-30T17:00:26.982Z" }, + { url = "https://files.pythonhosted.org/packages/0c/50/7b4382e5f5d256ac720ee0bd6470c7aa7d28f78570bd44d5e0b1c29eeb96/psycopg_binary-3.1.20-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:12b33c511f0be79d5a68231a10972ef9c68d954d30d176679472057ecc22891a", size = 3198871, upload_time = "2024-06-30T17:00:32.17Z" }, + { url = "https://files.pythonhosted.org/packages/76/2f/eda1b86c01d2803ac05714b94283af1e5012437dcc63dfe0679cc4d445ad/psycopg_binary-3.1.20-cp312-cp312-win_amd64.whl", hash = "sha256:6f3c0b05fc3cbd4d99aaacf5c7afa13b086df5777b9fefb78d31bf81fc70bd04", size = 2884414, upload_time = "2024-06-30T17:00:40.26Z" }, +] + +[[package]] +name = "psycopg2-binary" +version = "2.9.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cb/0e/bdc8274dc0585090b4e3432267d7be4dfbfd8971c0fa59167c711105a6bf/psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2", size = 385764, upload_time = "2024-10-16T11:24:58.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/8f/9feb01291d0d7a0a4c6a6bab24094135c2b59c6a81943752f632c75896d6/psycopg2_binary-2.9.10-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:04392983d0bb89a8717772a193cfaac58871321e3ec69514e1c4e0d4957b5aff", size = 3043397, upload_time = "2024-10-16T11:19:40.033Z" }, + { url = "https://files.pythonhosted.org/packages/15/30/346e4683532011561cd9c8dfeac6a8153dd96452fee0b12666058ab7893c/psycopg2_binary-2.9.10-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1a6784f0ce3fec4edc64e985865c17778514325074adf5ad8f80636cd029ef7c", size = 3274806, upload_time = "2024-10-16T11:19:43.5Z" }, + { url = "https://files.pythonhosted.org/packages/66/6e/4efebe76f76aee7ec99166b6c023ff8abdc4e183f7b70913d7c047701b79/psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f86c56eeb91dc3135b3fd8a95dc7ae14c538a2f3ad77a19645cf55bab1799c", size = 2851370, upload_time = "2024-10-16T11:19:46.986Z" }, + { url = "https://files.pythonhosted.org/packages/7f/fd/ff83313f86b50f7ca089b161b8e0a22bb3c319974096093cd50680433fdb/psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b3d2491d4d78b6b14f76881905c7a8a8abcf974aad4a8a0b065273a0ed7a2cb", size = 3080780, upload_time = "2024-10-16T11:19:50.242Z" }, + { url = "https://files.pythonhosted.org/packages/e6/c4/bfadd202dcda8333a7ccafdc51c541dbdfce7c2c7cda89fa2374455d795f/psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2286791ececda3a723d1910441c793be44625d86d1a4e79942751197f4d30341", size = 3264583, upload_time = "2024-10-16T11:19:54.424Z" }, + { url = "https://files.pythonhosted.org/packages/5d/f1/09f45ac25e704ac954862581f9f9ae21303cc5ded3d0b775532b407f0e90/psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:512d29bb12608891e349af6a0cccedce51677725a921c07dba6342beaf576f9a", size = 3019831, upload_time = "2024-10-16T11:19:57.762Z" }, + { url = "https://files.pythonhosted.org/packages/9e/2e/9beaea078095cc558f215e38f647c7114987d9febfc25cb2beed7c3582a5/psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a507320c58903967ef7384355a4da7ff3f28132d679aeb23572753cbf2ec10b", size = 2871822, upload_time = "2024-10-16T11:20:04.693Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/9e/ef93c5d93f3dc9fc92786ffab39e323b9aed066ba59fdc34cf85e2722271/psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6d4fa1079cab9018f4d0bd2db307beaa612b0d13ba73b5c6304b9fe2fb441ff7", size = 2820975, upload_time = "2024-10-16T11:20:11.401Z" }, + { url = "https://files.pythonhosted.org/packages/a5/f0/049e9631e3268fe4c5a387f6fc27e267ebe199acf1bc1bc9cbde4bd6916c/psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:851485a42dbb0bdc1edcdabdb8557c09c9655dfa2ca0460ff210522e073e319e", size = 2919320, upload_time = "2024-10-16T11:20:17.959Z" }, + { url = "https://files.pythonhosted.org/packages/dc/9a/bcb8773b88e45fb5a5ea8339e2104d82c863a3b8558fbb2aadfe66df86b3/psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:35958ec9e46432d9076286dda67942ed6d968b9c3a6a2fd62b48939d1d78bf68", size = 2957617, upload_time = "2024-10-16T11:20:24.711Z" }, + { url = "https://files.pythonhosted.org/packages/e2/6b/144336a9bf08a67d217b3af3246abb1d027095dab726f0687f01f43e8c03/psycopg2_binary-2.9.10-cp311-cp311-win32.whl", hash = "sha256:ecced182e935529727401b24d76634a357c71c9275b356efafd8a2a91ec07392", size = 1024618, upload_time = "2024-10-16T11:20:27.718Z" }, + { url = "https://files.pythonhosted.org/packages/61/69/3b3d7bd583c6d3cbe5100802efa5beacaacc86e37b653fc708bf3d6853b8/psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:ee0e8c683a7ff25d23b55b11161c2663d4b099770f6085ff0a20d4505778d6b4", size = 1163816, upload_time = "2024-10-16T11:20:30.777Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/465cc9795cf76f6d329efdafca74693714556ea3891813701ac1fee87545/psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0", size = 3044771, upload_time = "2024-10-16T11:20:35.234Z" }, + { url = "https://files.pythonhosted.org/packages/8b/31/6d225b7b641a1a2148e3ed65e1aa74fc86ba3fee850545e27be9e1de893d/psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a", size = 3275336, upload_time = "2024-10-16T11:20:38.742Z" }, + { url = "https://files.pythonhosted.org/packages/30/b7/a68c2b4bff1cbb1728e3ec864b2d92327c77ad52edcd27922535a8366f68/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539", size = 2851637, upload_time = "2024-10-16T11:20:42.145Z" }, + { url = "https://files.pythonhosted.org/packages/0b/b1/cfedc0e0e6f9ad61f8657fd173b2f831ce261c02a08c0b09c652b127d813/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526", size = 3082097, upload_time = "2024-10-16T11:20:46.185Z" }, + { url = "https://files.pythonhosted.org/packages/18/ed/0a8e4153c9b769f59c02fb5e7914f20f0b2483a19dae7bf2db54b743d0d0/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1", size = 3264776, upload_time = "2024-10-16T11:20:50.879Z" }, + { url = "https://files.pythonhosted.org/packages/10/db/d09da68c6a0cdab41566b74e0a6068a425f077169bed0946559b7348ebe9/psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e", size = 3020968, 
upload_time = "2024-10-16T11:20:56.819Z" }, + { url = "https://files.pythonhosted.org/packages/94/28/4d6f8c255f0dfffb410db2b3f9ac5218d959a66c715c34cac31081e19b95/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f", size = 2872334, upload_time = "2024-10-16T11:21:02.411Z" }, + { url = "https://files.pythonhosted.org/packages/05/f7/20d7bf796593c4fea95e12119d6cc384ff1f6141a24fbb7df5a668d29d29/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00", size = 2822722, upload_time = "2024-10-16T11:21:09.01Z" }, + { url = "https://files.pythonhosted.org/packages/4d/e4/0c407ae919ef626dbdb32835a03b6737013c3cc7240169843965cada2bdf/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5", size = 2920132, upload_time = "2024-10-16T11:21:16.339Z" }, + { url = "https://files.pythonhosted.org/packages/2d/70/aa69c9f69cf09a01da224909ff6ce8b68faeef476f00f7ec377e8f03be70/psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47", size = 2959312, upload_time = "2024-10-16T11:21:25.584Z" }, + { url = "https://files.pythonhosted.org/packages/d3/bd/213e59854fafe87ba47814bf413ace0dcee33a89c8c8c814faca6bc7cf3c/psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64", size = 1025191, upload_time = "2024-10-16T11:21:29.912Z" }, + { url = "https://files.pythonhosted.org/packages/92/29/06261ea000e2dc1e22907dbbc483a1093665509ea586b29b8986a0e56733/psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0", size = 1164031, upload_time = "2024-10-16T11:21:34.211Z" }, + { url = "https://files.pythonhosted.org/packages/3e/30/d41d3ba765609c0763505d565c4d12d8f3c79793f0d0f044ff5a28bf395b/psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d", size = 3044699, upload_time = "2024-10-16T11:21:42.841Z" }, + { url = "https://files.pythonhosted.org/packages/35/44/257ddadec7ef04536ba71af6bc6a75ec05c5343004a7ec93006bee66c0bc/psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb", size = 3275245, upload_time = "2024-10-16T11:21:51.989Z" }, + { url = "https://files.pythonhosted.org/packages/1b/11/48ea1cd11de67f9efd7262085588790a95d9dfcd9b8a687d46caf7305c1a/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7", size = 2851631, upload_time = "2024-10-16T11:21:57.584Z" }, + { url = "https://files.pythonhosted.org/packages/62/e0/62ce5ee650e6c86719d621a761fe4bc846ab9eff8c1f12b1ed5741bf1c9b/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d", size = 3082140, upload_time = "2024-10-16T11:22:02.005Z" }, + { url = "https://files.pythonhosted.org/packages/27/ce/63f946c098611f7be234c0dd7cb1ad68b0b5744d34f68062bb3c5aa510c8/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73", size = 3264762, upload_time = "2024-10-16T11:22:06.412Z" }, + { url = "https://files.pythonhosted.org/packages/43/25/c603cd81402e69edf7daa59b1602bd41eb9859e2824b8c0855d748366ac9/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673", size = 3020967, upload_time = "2024-10-16T11:22:11.583Z" }, + { url = "https://files.pythonhosted.org/packages/5f/d6/8708d8c6fca531057fa170cdde8df870e8b6a9b136e82b361c65e42b841e/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f", size = 2872326, upload_time = "2024-10-16T11:22:16.406Z" }, + { url = "https://files.pythonhosted.org/packages/ce/ac/5b1ea50fc08a9df82de7e1771537557f07c2632231bbab652c7e22597908/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909", size = 2822712, upload_time = "2024-10-16T11:22:21.366Z" }, + { url = "https://files.pythonhosted.org/packages/c4/fc/504d4503b2abc4570fac3ca56eb8fed5e437bf9c9ef13f36b6621db8ef00/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1", size = 2920155, upload_time = "2024-10-16T11:22:25.684Z" }, + { url = "https://files.pythonhosted.org/packages/b2/d1/323581e9273ad2c0dbd1902f3fb50c441da86e894b6e25a73c3fda32c57e/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567", size = 2959356, upload_time = "2024-10-16T11:22:30.562Z" }, + { url = "https://files.pythonhosted.org/packages/08/50/d13ea0a054189ae1bc21af1d85b6f8bb9bbc5572991055d70ad9006fe2d6/psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142", size = 2569224, upload_time = "2025-01-04T20:09:19.234Z" }, +] + +[[package]] +name = "pyarrow" +version = "20.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload_time = "2025-04-27T12:34:23.264Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0", size = 30856035, upload_time = "2025-04-27T12:28:40.78Z" }, + { url = "https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb", size = 32309552, upload_time = "2025-04-27T12:28:47.051Z" }, + { url = "https://files.pythonhosted.org/packages/44/fb/dfb2dfdd3e488bb14f822d7335653092dde150cffc2da97de6e7500681f9/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232", size = 41334704, upload_time = "2025-04-27T12:28:55.064Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/0d/08a95878d38808051a953e887332d4a76bc06c6ee04351918ee1155407eb/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f", size = 42399836, upload_time = "2025-04-27T12:29:02.13Z" }, + { url = "https://files.pythonhosted.org/packages/f3/cd/efa271234dfe38f0271561086eedcad7bc0f2ddd1efba423916ff0883684/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab", size = 40711789, upload_time = "2025-04-27T12:29:09.951Z" }, + { url = "https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62", size = 42301124, upload_time = "2025-04-27T12:29:17.187Z" }, + { url = "https://files.pythonhosted.org/packages/4f/92/692c562be4504c262089e86757a9048739fe1acb4024f92d39615e7bab3f/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c", size = 42916060, upload_time = "2025-04-27T12:29:24.253Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ec/9f5c7e7c828d8e0a3c7ef50ee62eca38a7de2fa6eb1b8fa43685c9414fef/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3", size = 44547640, upload_time = "2025-04-27T12:29:32.782Z" }, + { url = "https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc", size = 25781491, upload_time = "2025-04-27T12:29:38.464Z" }, + { url = "https://files.pythonhosted.org/packages/a1/d6/0c10e0d54f6c13eb464ee9b67a68b8c71bcf2f67760ef5b6fbcddd2ab05f/pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba", size = 30815067, upload_time = "2025-04-27T12:29:44.384Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e2/04e9874abe4094a06fd8b0cbb0f1312d8dd7d707f144c2ec1e5e8f452ffa/pyarrow-20.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781", size = 32297128, upload_time = "2025-04-27T12:29:52.038Z" }, + { url = "https://files.pythonhosted.org/packages/31/fd/c565e5dcc906a3b471a83273039cb75cb79aad4a2d4a12f76cc5ae90a4b8/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199", size = 41334890, upload_time = "2025-04-27T12:29:59.452Z" }, + { url = "https://files.pythonhosted.org/packages/af/a9/3bdd799e2c9b20c1ea6dc6fa8e83f29480a97711cf806e823f808c2316ac/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd", size = 42421775, upload_time = "2025-04-27T12:30:06.875Z" }, + { url = "https://files.pythonhosted.org/packages/10/f7/da98ccd86354c332f593218101ae56568d5dcedb460e342000bd89c49cc1/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28", size = 40687231, upload_time = "2025-04-27T12:30:13.954Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/1b/2168d6050e52ff1e6cefc61d600723870bf569cbf41d13db939c8cf97a16/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8", size = 42295639, upload_time = "2025-04-27T12:30:21.949Z" }, + { url = "https://files.pythonhosted.org/packages/b2/66/2d976c0c7158fd25591c8ca55aee026e6d5745a021915a1835578707feb3/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e", size = 42908549, upload_time = "2025-04-27T12:30:29.551Z" }, + { url = "https://files.pythonhosted.org/packages/31/a9/dfb999c2fc6911201dcbf348247f9cc382a8990f9ab45c12eabfd7243a38/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a", size = 44557216, upload_time = "2025-04-27T12:30:36.977Z" }, + { url = "https://files.pythonhosted.org/packages/a0/8e/9adee63dfa3911be2382fb4d92e4b2e7d82610f9d9f668493bebaa2af50f/pyarrow-20.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b", size = 25660496, upload_time = "2025-04-27T12:30:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload_time = "2025-04-27T12:30:48.351Z" }, + { url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload_time = "2025-04-27T12:30:55.238Z" }, + { url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload_time = "2025-04-27T12:31:05.587Z" }, + { url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload_time = "2025-04-27T12:31:15.675Z" }, + { url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload_time = "2025-04-27T12:31:24.631Z" }, + { url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload_time = "2025-04-27T12:31:31.311Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload_time = "2025-04-27T12:31:39.406Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload_time = "2025-04-27T12:31:45.997Z" }, + { url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload_time = "2025-04-27T12:31:54.11Z" }, + { url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload_time = "2025-04-27T12:31:59.215Z" }, + { url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload_time = "2025-04-27T12:32:05.369Z" }, + { url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload_time = "2025-04-27T12:32:11.814Z" }, + { url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload_time = "2025-04-27T12:32:20.766Z" }, + { url = "https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload_time = "2025-04-27T12:32:28.1Z" }, + { url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload_time = "2025-04-27T12:32:35.792Z" }, + { url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload_time = "2025-04-27T12:32:46.64Z" }, + { url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload_time = "2025-04-27T12:32:56.503Z" }, + { url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload_time = "2025-04-27T12:33:04.72Z" }, +] + +[[package]] +name = "pyarrow-hotfix" +version = "0.7" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d2/ed/c3e8677f7abf3981838c2af7b5ac03e3589b3ef94fcb31d575426abae904/pyarrow_hotfix-0.7.tar.gz", hash = "sha256:59399cd58bdd978b2e42816a4183a55c6472d4e33d183351b6069f11ed42661d", size = 9910, upload_time = "2025-04-25T10:17:06.247Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/c3/94ade4906a2f88bc935772f59c934013b4205e773bcb4239db114a6da136/pyarrow_hotfix-0.7-py3-none-any.whl", hash = "sha256:3236f3b5f1260f0e2ac070a55c1a7b339c4bb7267839bd2015e283234e758100", size = 7923, upload_time = "2025-04-25T10:17:05.224Z" }, +] + +[[package]] +name = "pyasn1" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload_time = "2024-09-10T22:41:42.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload_time = "2024-09-11T16:00:36.122Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload_time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload_time = "2025-03-28T02:41:19.028Z" }, +] + +[[package]] +name = "pydantic" +version = "2.10.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/ae/d5220c5c52b158b1de7ca89fc5edb72f304a70a4c540c84c8844bf4008de/pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236", size = 761681, upload_time = "2025-01-24T01:42:12.693Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", size = 431696, upload_time = "2025-01-24T01:42:10.371Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443, upload_time = "2024-12-18T11:31:54.917Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c2/89/f3450af9d09d44eea1f2c369f49e8f181d742f28220f88cc4dfaae91ea6e/pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc", size = 1893421, upload_time = "2024-12-18T11:27:55.409Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e3/71fe85af2021f3f386da42d291412e5baf6ce7716bd7101ea49c810eda90/pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7", size = 1814998, upload_time = "2024-12-18T11:27:57.252Z" }, + { url = "https://files.pythonhosted.org/packages/a6/3c/724039e0d848fd69dbf5806894e26479577316c6f0f112bacaf67aa889ac/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15", size = 1826167, upload_time = "2024-12-18T11:27:59.146Z" }, + { url = "https://files.pythonhosted.org/packages/2b/5b/1b29e8c1fb5f3199a9a57c1452004ff39f494bbe9bdbe9a81e18172e40d3/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306", size = 1865071, upload_time = "2024-12-18T11:28:02.625Z" }, + { url = "https://files.pythonhosted.org/packages/89/6c/3985203863d76bb7d7266e36970d7e3b6385148c18a68cc8915fd8c84d57/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99", size = 2036244, upload_time = "2024-12-18T11:28:04.442Z" }, + { url = "https://files.pythonhosted.org/packages/0e/41/f15316858a246b5d723f7d7f599f79e37493b2e84bfc789e58d88c209f8a/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459", size = 2737470, upload_time = "2024-12-18T11:28:07.679Z" }, + { url = "https://files.pythonhosted.org/packages/a8/7c/b860618c25678bbd6d1d99dbdfdf0510ccb50790099b963ff78a124b754f/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048", size = 1992291, upload_time = "2024-12-18T11:28:10.297Z" }, + { url = "https://files.pythonhosted.org/packages/bf/73/42c3742a391eccbeab39f15213ecda3104ae8682ba3c0c28069fbcb8c10d/pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d", size = 1994613, upload_time = "2024-12-18T11:28:13.362Z" }, + { url = "https://files.pythonhosted.org/packages/94/7a/941e89096d1175d56f59340f3a8ebaf20762fef222c298ea96d36a6328c5/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b", size = 2002355, upload_time = "2024-12-18T11:28:16.587Z" }, + { url = "https://files.pythonhosted.org/packages/6e/95/2359937a73d49e336a5a19848713555605d4d8d6940c3ec6c6c0ca4dcf25/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474", size = 2126661, upload_time = "2024-12-18T11:28:18.407Z" }, + { url = "https://files.pythonhosted.org/packages/2b/4c/ca02b7bdb6012a1adef21a50625b14f43ed4d11f1fc237f9d7490aa5078c/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6", size = 2153261, upload_time = "2024-12-18T11:28:21.471Z" }, + { url = "https://files.pythonhosted.org/packages/72/9d/a241db83f973049a1092a079272ffe2e3e82e98561ef6214ab53fe53b1c7/pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c", size = 1812361, upload_time = "2024-12-18T11:28:23.53Z" }, + { url = "https://files.pythonhosted.org/packages/e8/ef/013f07248041b74abd48a385e2110aa3a9bbfef0fbd97d4e6d07d2f5b89a/pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc", size = 1982484, upload_time = "2024-12-18T11:28:25.391Z" }, + { url = "https://files.pythonhosted.org/packages/10/1c/16b3a3e3398fd29dca77cea0a1d998d6bde3902fa2706985191e2313cc76/pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4", size = 1867102, upload_time = "2024-12-18T11:28:28.593Z" }, + { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127, upload_time = "2024-12-18T11:28:30.346Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340, upload_time = "2024-12-18T11:28:32.521Z" }, + { url = "https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900, upload_time = "2024-12-18T11:28:34.507Z" }, + { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177, upload_time = "2024-12-18T11:28:36.488Z" }, + { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046, upload_time = "2024-12-18T11:28:39.409Z" }, + { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386, upload_time = "2024-12-18T11:28:41.221Z" }, + { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060, upload_time = "2024-12-18T11:28:44.709Z" }, + { url = 
"https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870, upload_time = "2024-12-18T11:28:46.839Z" }, + { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822, upload_time = "2024-12-18T11:28:48.896Z" }, + { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364, upload_time = "2024-12-18T11:28:50.755Z" }, + { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303, upload_time = "2024-12-18T11:28:54.122Z" }, + { url = "https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064, upload_time = "2024-12-18T11:28:56.074Z" }, + { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046, upload_time = "2024-12-18T11:28:58.107Z" }, + { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092, upload_time = "2024-12-18T11:29:01.335Z" }, + { url = "https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709, upload_time = "2024-12-18T11:29:03.193Z" }, + { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273, upload_time = "2024-12-18T11:29:05.306Z" }, + { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027, upload_time = "2024-12-18T11:29:07.294Z" }, + { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888, upload_time = "2024-12-18T11:29:09.249Z" }, + { url = 
"https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738, upload_time = "2024-12-18T11:29:11.23Z" }, + { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138, upload_time = "2024-12-18T11:29:16.396Z" }, + { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025, upload_time = "2024-12-18T11:29:20.25Z" }, + { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633, upload_time = "2024-12-18T11:29:23.877Z" }, + { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404, upload_time = "2024-12-18T11:29:25.872Z" }, + { url = "https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130, upload_time = "2024-12-18T11:29:29.252Z" }, + { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946, upload_time = "2024-12-18T11:29:31.338Z" }, + { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387, upload_time = "2024-12-18T11:29:33.481Z" }, + { url = "https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453, upload_time = "2024-12-18T11:29:35.533Z" }, + { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186, upload_time = "2024-12-18T11:29:37.649Z" }, +] + +[[package]] +name = "pyee" +version = "12.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d2/a7/8faaa62a488a2a1e0d56969757f087cbd2729e9bcfa508c230299f366b4c/pyee-12.0.0.tar.gz", hash = 
"sha256:c480603f4aa2927d4766eb41fa82793fe60a82cbfdb8d688e0d08c55a534e145", size = 29675, upload_time = "2024-08-30T19:40:43.555Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/0d/95993c08c721ec68892547f2117e8f9dfbcef2ca71e098533541b4a54d5f/pyee-12.0.0-py3-none-any.whl", hash = "sha256:7b14b74320600049ccc7d0e0b1becd3b4bd0a03c745758225e31a59f4095c990", size = 14831, upload_time = "2024-08-30T19:40:42.132Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581, upload_time = "2025-01-06T17:26:30.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293, upload_time = "2025-01-06T17:26:25.553Z" }, +] + +[[package]] +name = "pyjwt" +version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785, upload_time = "2024-11-28T03:43:29.933Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997, upload_time = "2024-11-28T03:43:27.893Z" }, +] + +[[package]] +name = "pyparsing" +version = "3.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/22/f1129e69d94ffff626bdb5c835506b3a5b4f3d070f17ea295e12c2c6f60f/pyparsing-3.2.3.tar.gz", hash = "sha256:b9c13f1ab8b3b542f72e28f634bad4de758ab3ce4546e4301970ad6fa77c38be", size = 1088608, upload_time = "2025-03-25T05:01:28.114Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120, upload_time = "2025-03-25T05:01:24.908Z" }, +] + +[[package]] +name = "pytest" +version = "8.3.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload_time = "2025-03-02T12:54:54.503Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634, upload_time = "2025-03-02T12:54:52.069Z" }, +] + +[[package]] +name = "pytest-asyncio" +version = "0.25.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist 
= { url = "https://files.pythonhosted.org/packages/f2/a8/ecbc8ede70921dd2f544ab1cadd3ff3bf842af27f87bbdea774c7baa1d38/pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a", size = 54239, upload_time = "2025-01-28T18:37:58.729Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467, upload_time = "2025-01-28T18:37:56.798Z" }, +] + +[[package]] +name = "pytest-mock" +version = "3.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/96/25588c55fbe330b751bd7c7d723c3544957566bc090f6d506551b514f488/pytest-mock-3.12.0.tar.gz", hash = "sha256:31a40f038c22cad32287bb43932054451ff5583ff094bca6f675df2f8bc1a6e9", size = 32139, upload_time = "2023-10-19T16:25:57.7Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/25/b29fd10dd062cf41e66787a7951b3842881a2a2d7e3a41fcbb58a8466046/pytest_mock-3.12.0-py3-none-any.whl", hash = "sha256:0972719a7263072da3a21c7f4773069bcc7486027d7e8e1f81d98a47e701bc4f", size = 9771, upload_time = "2023-10-19T16:25:55.764Z" }, +] + +[[package]] +name = "pytest-timeout" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/93/0d/04719abc7a4bdb3a7a1f968f24b0f5253d698c9cc94975330e9d3145befb/pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9", size = 17697, upload_time = "2024-03-07T21:04:01.069Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/27/14af9ef8321f5edc7527e47def2a21d8118c6f329a9342cc61387a0c0599/pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e", size = 14148, upload_time = "2024-03-07T21:03:58.764Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload_time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload_time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bc/57/e84d88dfe0aec03b7a2d4327012c1627ab5f03652216c63d49846d7a6c58/python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", size = 39115, upload_time = "2024-01-23T06:33:00.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", 
size = 19863, upload_time = "2024-01-23T06:32:58.246Z" }, +] + +[[package]] +name = "python-multipart" +version = "0.0.20" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158, upload_time = "2024-12-16T19:45:46.972Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload_time = "2024-12-16T19:45:44.423Z" }, +] + +[[package]] +name = "python-slugify" +version = "8.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "text-unidecode" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/87/c7/5e1547c44e31da50a460df93af11a535ace568ef89d7a811069ead340c4a/python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856", size = 10921, upload_time = "2024-02-08T18:32:45.488Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8", size = 10051, upload_time = "2024-02-08T18:32:43.911Z" }, +] + +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload_time = "2025-03-25T02:25:00.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" }, +] + +[[package]] +name = "pywin32" +version = "310" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/b1/68aa2986129fb1011dabbe95f0136f44509afaf072b12b8f815905a39f33/pywin32-310-cp311-cp311-win32.whl", hash = "sha256:1e765f9564e83011a63321bb9d27ec456a0ed90d3732c4b2e312b855365ed8bd", size = 8784284, upload_time = "2025-03-17T00:55:53.124Z" }, + { url = "https://files.pythonhosted.org/packages/b3/bd/d1592635992dd8db5bb8ace0551bc3a769de1ac8850200cfa517e72739fb/pywin32-310-cp311-cp311-win_amd64.whl", hash = "sha256:126298077a9d7c95c53823934f000599f66ec9296b09167810eb24875f32689c", size = 9520748, upload_time = "2025-03-17T00:55:55.203Z" }, + { url = "https://files.pythonhosted.org/packages/90/b1/ac8b1ffce6603849eb45a91cf126c0fa5431f186c2e768bf56889c46f51c/pywin32-310-cp311-cp311-win_arm64.whl", hash = "sha256:19ec5fc9b1d51c4350be7bb00760ffce46e6c95eaf2f0b2f1150657b1a43c582", size = 8455941, upload_time = "2025-03-17T00:55:57.048Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ec/4fdbe47932f671d6e348474ea35ed94227fb5df56a7c30cbbb42cd396ed0/pywin32-310-cp312-cp312-win32.whl", hash = "sha256:8a75a5cc3893e83a108c05d82198880704c44bbaee4d06e442e471d3c9ea4f3d", size = 8796239, upload_time = 
"2025-03-17T00:55:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/e3/e5/b0627f8bb84e06991bea89ad8153a9e50ace40b2e1195d68e9dff6b03d0f/pywin32-310-cp312-cp312-win_amd64.whl", hash = "sha256:bf5c397c9a9a19a6f62f3fb821fbf36cac08f03770056711f765ec1503972060", size = 9503839, upload_time = "2025-03-17T00:56:00.8Z" }, + { url = "https://files.pythonhosted.org/packages/1f/32/9ccf53748df72301a89713936645a664ec001abd35ecc8578beda593d37d/pywin32-310-cp312-cp312-win_arm64.whl", hash = "sha256:2349cc906eae872d0663d4d6290d13b90621eaf78964bb1578632ff20e152966", size = 8459470, upload_time = "2025-03-17T00:56:02.601Z" }, + { url = "https://files.pythonhosted.org/packages/1c/09/9c1b978ffc4ae53999e89c19c77ba882d9fce476729f23ef55211ea1c034/pywin32-310-cp313-cp313-win32.whl", hash = "sha256:5d241a659c496ada3253cd01cfaa779b048e90ce4b2b38cd44168ad555ce74ab", size = 8794384, upload_time = "2025-03-17T00:56:04.383Z" }, + { url = "https://files.pythonhosted.org/packages/45/3c/b4640f740ffebadd5d34df35fecba0e1cfef8fde9f3e594df91c28ad9b50/pywin32-310-cp313-cp313-win_amd64.whl", hash = "sha256:667827eb3a90208ddbdcc9e860c81bde63a135710e21e4cb3348968e4bd5249e", size = 9503039, upload_time = "2025-03-17T00:56:06.207Z" }, + { url = "https://files.pythonhosted.org/packages/b4/f4/f785020090fb050e7fb6d34b780f2231f302609dc964672f72bfaeb59a28/pywin32-310-cp313-cp313-win_arm64.whl", hash = "sha256:e308f831de771482b7cf692a1f308f8fca701b2d8f9dde6cc440c7da17e47b33", size = 8458152, upload_time = "2025-03-17T00:56:07.819Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload_time = "2024-08-06T20:33:50.674Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload_time = "2024-08-06T20:32:03.408Z" }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload_time = "2024-08-06T20:32:04.926Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload_time = "2024-08-06T20:32:06.459Z" }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload_time = "2024-08-06T20:32:08.338Z" }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload_time = "2024-08-06T20:32:14.124Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload_time = "2024-08-06T20:32:16.17Z" }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload_time = "2024-08-06T20:32:18.555Z" }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850, upload_time = "2024-08-06T20:32:19.889Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload_time = "2024-08-06T20:32:21.273Z" }, + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload_time = "2024-08-06T20:32:25.131Z" }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload_time = "2024-08-06T20:32:26.511Z" }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload_time = "2024-08-06T20:32:28.363Z" }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload_time = "2024-08-06T20:32:30.058Z" }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload_time = "2024-08-06T20:32:31.881Z" }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload_time = "2024-08-06T20:32:37.083Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload_time = "2024-08-06T20:32:38.898Z" }, + { url = 
"https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload_time = "2024-08-06T20:32:40.241Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload_time = "2024-08-06T20:32:41.93Z" }, + { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload_time = "2024-08-06T20:32:43.4Z" }, + { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload_time = "2024-08-06T20:32:44.801Z" }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload_time = "2024-08-06T20:32:46.432Z" }, + { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload_time = "2024-08-06T20:32:51.188Z" }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload_time = "2024-08-06T20:32:53.019Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload_time = "2024-08-06T20:32:54.708Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload_time = "2024-08-06T20:32:56.985Z" }, + { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload_time = "2024-08-06T20:33:03.001Z" }, + { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload_time = "2024-08-06T20:33:04.33Z" }, +] + +[[package]] +name = "regex" +version = "2024.11.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/8e/5f/bd69653fbfb76cf8604468d3b4ec4c403197144c7bfe0e6a5fc9e02a07cb/regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519", size = 399494, upload_time = "2024-11-06T20:12:31.635Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/58/7e4d9493a66c88a7da6d205768119f51af0f684fe7be7bac8328e217a52c/regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638", size = 482669, upload_time = "2024-11-06T20:09:31.064Z" }, + { url = "https://files.pythonhosted.org/packages/34/4c/8f8e631fcdc2ff978609eaeef1d6994bf2f028b59d9ac67640ed051f1218/regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7", size = 287684, upload_time = "2024-11-06T20:09:32.915Z" }, + { url = "https://files.pythonhosted.org/packages/c5/1b/f0e4d13e6adf866ce9b069e191f303a30ab1277e037037a365c3aad5cc9c/regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20", size = 284589, upload_time = "2024-11-06T20:09:35.504Z" }, + { url = "https://files.pythonhosted.org/packages/25/4d/ab21047f446693887f25510887e6820b93f791992994f6498b0318904d4a/regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114", size = 792121, upload_time = "2024-11-06T20:09:37.701Z" }, + { url = "https://files.pythonhosted.org/packages/45/ee/c867e15cd894985cb32b731d89576c41a4642a57850c162490ea34b78c3b/regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3", size = 831275, upload_time = "2024-11-06T20:09:40.371Z" }, + { url = "https://files.pythonhosted.org/packages/b3/12/b0f480726cf1c60f6536fa5e1c95275a77624f3ac8fdccf79e6727499e28/regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f", size = 818257, upload_time = "2024-11-06T20:09:43.059Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ce/0d0e61429f603bac433910d99ef1a02ce45a8967ffbe3cbee48599e62d88/regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0", size = 792727, upload_time = "2024-11-06T20:09:48.19Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c1/243c83c53d4a419c1556f43777ccb552bccdf79d08fda3980e4e77dd9137/regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55", size = 780667, upload_time = "2024-11-06T20:09:49.828Z" }, + { url = "https://files.pythonhosted.org/packages/c5/f4/75eb0dd4ce4b37f04928987f1d22547ddaf6c4bae697623c1b05da67a8aa/regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89", size = 776963, upload_time = "2024-11-06T20:09:51.819Z" }, + { url = "https://files.pythonhosted.org/packages/16/5d/95c568574e630e141a69ff8a254c2f188b4398e813c40d49228c9bbd9875/regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d", size = 784700, upload_time = 
"2024-11-06T20:09:53.982Z" }, + { url = "https://files.pythonhosted.org/packages/8e/b5/f8495c7917f15cc6fee1e7f395e324ec3e00ab3c665a7dc9d27562fd5290/regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34", size = 848592, upload_time = "2024-11-06T20:09:56.222Z" }, + { url = "https://files.pythonhosted.org/packages/1c/80/6dd7118e8cb212c3c60b191b932dc57db93fb2e36fb9e0e92f72a5909af9/regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d", size = 852929, upload_time = "2024-11-06T20:09:58.642Z" }, + { url = "https://files.pythonhosted.org/packages/11/9b/5a05d2040297d2d254baf95eeeb6df83554e5e1df03bc1a6687fc4ba1f66/regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45", size = 781213, upload_time = "2024-11-06T20:10:00.867Z" }, + { url = "https://files.pythonhosted.org/packages/26/b7/b14e2440156ab39e0177506c08c18accaf2b8932e39fb092074de733d868/regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9", size = 261734, upload_time = "2024-11-06T20:10:03.361Z" }, + { url = "https://files.pythonhosted.org/packages/80/32/763a6cc01d21fb3819227a1cc3f60fd251c13c37c27a73b8ff4315433a8e/regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60", size = 274052, upload_time = "2024-11-06T20:10:05.179Z" }, + { url = "https://files.pythonhosted.org/packages/ba/30/9a87ce8336b172cc232a0db89a3af97929d06c11ceaa19d97d84fa90a8f8/regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a", size = 483781, upload_time = "2024-11-06T20:10:07.07Z" }, + { url = "https://files.pythonhosted.org/packages/01/e8/00008ad4ff4be8b1844786ba6636035f7ef926db5686e4c0f98093612add/regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9", size = 288455, upload_time = "2024-11-06T20:10:09.117Z" }, + { url = "https://files.pythonhosted.org/packages/60/85/cebcc0aff603ea0a201667b203f13ba75d9fc8668fab917ac5b2de3967bc/regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2", size = 284759, upload_time = "2024-11-06T20:10:11.155Z" }, + { url = "https://files.pythonhosted.org/packages/94/2b/701a4b0585cb05472a4da28ee28fdfe155f3638f5e1ec92306d924e5faf0/regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4", size = 794976, upload_time = "2024-11-06T20:10:13.24Z" }, + { url = "https://files.pythonhosted.org/packages/4b/bf/fa87e563bf5fee75db8915f7352e1887b1249126a1be4813837f5dbec965/regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577", size = 833077, upload_time = "2024-11-06T20:10:15.37Z" }, + { url = "https://files.pythonhosted.org/packages/a1/56/7295e6bad94b047f4d0834e4779491b81216583c00c288252ef625c01d23/regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3", size = 823160, upload_time = "2024-11-06T20:10:19.027Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/13/e3b075031a738c9598c51cfbc4c7879e26729c53aa9cca59211c44235314/regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e", size = 796896, upload_time = "2024-11-06T20:10:21.85Z" }, + { url = "https://files.pythonhosted.org/packages/24/56/0b3f1b66d592be6efec23a795b37732682520b47c53da5a32c33ed7d84e3/regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe", size = 783997, upload_time = "2024-11-06T20:10:24.329Z" }, + { url = "https://files.pythonhosted.org/packages/f9/a1/eb378dada8b91c0e4c5f08ffb56f25fcae47bf52ad18f9b2f33b83e6d498/regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e", size = 781725, upload_time = "2024-11-06T20:10:28.067Z" }, + { url = "https://files.pythonhosted.org/packages/83/f2/033e7dec0cfd6dda93390089864732a3409246ffe8b042e9554afa9bff4e/regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29", size = 789481, upload_time = "2024-11-06T20:10:31.612Z" }, + { url = "https://files.pythonhosted.org/packages/83/23/15d4552ea28990a74e7696780c438aadd73a20318c47e527b47a4a5a596d/regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39", size = 852896, upload_time = "2024-11-06T20:10:34.054Z" }, + { url = "https://files.pythonhosted.org/packages/e3/39/ed4416bc90deedbfdada2568b2cb0bc1fdb98efe11f5378d9892b2a88f8f/regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51", size = 860138, upload_time = "2024-11-06T20:10:36.142Z" }, + { url = "https://files.pythonhosted.org/packages/93/2d/dd56bb76bd8e95bbce684326302f287455b56242a4f9c61f1bc76e28360e/regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad", size = 787692, upload_time = "2024-11-06T20:10:38.394Z" }, + { url = "https://files.pythonhosted.org/packages/0b/55/31877a249ab7a5156758246b9c59539abbeba22461b7d8adc9e8475ff73e/regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54", size = 262135, upload_time = "2024-11-06T20:10:40.367Z" }, + { url = "https://files.pythonhosted.org/packages/38/ec/ad2d7de49a600cdb8dd78434a1aeffe28b9d6fc42eb36afab4a27ad23384/regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b", size = 273567, upload_time = "2024-11-06T20:10:43.467Z" }, + { url = "https://files.pythonhosted.org/packages/90/73/bcb0e36614601016552fa9344544a3a2ae1809dc1401b100eab02e772e1f/regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84", size = 483525, upload_time = "2024-11-06T20:10:45.19Z" }, + { url = "https://files.pythonhosted.org/packages/0f/3f/f1a082a46b31e25291d830b369b6b0c5576a6f7fb89d3053a354c24b8a83/regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4", size = 288324, upload_time = "2024-11-06T20:10:47.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/c9/4e68181a4a652fb3ef5099e077faf4fd2a694ea6e0f806a7737aff9e758a/regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0", size = 284617, upload_time = "2024-11-06T20:10:49.312Z" }, + { url = "https://files.pythonhosted.org/packages/fc/fd/37868b75eaf63843165f1d2122ca6cb94bfc0271e4428cf58c0616786dce/regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0", size = 795023, upload_time = "2024-11-06T20:10:51.102Z" }, + { url = "https://files.pythonhosted.org/packages/c4/7c/d4cd9c528502a3dedb5c13c146e7a7a539a3853dc20209c8e75d9ba9d1b2/regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7", size = 833072, upload_time = "2024-11-06T20:10:52.926Z" }, + { url = "https://files.pythonhosted.org/packages/4f/db/46f563a08f969159c5a0f0e722260568425363bea43bb7ae370becb66a67/regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7", size = 823130, upload_time = "2024-11-06T20:10:54.828Z" }, + { url = "https://files.pythonhosted.org/packages/db/60/1eeca2074f5b87df394fccaa432ae3fc06c9c9bfa97c5051aed70e6e00c2/regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c", size = 796857, upload_time = "2024-11-06T20:10:56.634Z" }, + { url = "https://files.pythonhosted.org/packages/10/db/ac718a08fcee981554d2f7bb8402f1faa7e868c1345c16ab1ebec54b0d7b/regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3", size = 784006, upload_time = "2024-11-06T20:10:59.369Z" }, + { url = "https://files.pythonhosted.org/packages/c2/41/7da3fe70216cea93144bf12da2b87367590bcf07db97604edeea55dac9ad/regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07", size = 781650, upload_time = "2024-11-06T20:11:02.042Z" }, + { url = "https://files.pythonhosted.org/packages/a7/d5/880921ee4eec393a4752e6ab9f0fe28009435417c3102fc413f3fe81c4e5/regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e", size = 789545, upload_time = "2024-11-06T20:11:03.933Z" }, + { url = "https://files.pythonhosted.org/packages/dc/96/53770115e507081122beca8899ab7f5ae28ae790bfcc82b5e38976df6a77/regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6", size = 853045, upload_time = "2024-11-06T20:11:06.497Z" }, + { url = "https://files.pythonhosted.org/packages/31/d3/1372add5251cc2d44b451bd94f43b2ec78e15a6e82bff6a290ef9fd8f00a/regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4", size = 860182, upload_time = "2024-11-06T20:11:09.06Z" }, + { url = "https://files.pythonhosted.org/packages/ed/e3/c446a64984ea9f69982ba1a69d4658d5014bc7a0ea468a07e1a1265db6e2/regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d", size = 787733, upload_time = 
"2024-11-06T20:11:11.256Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f1/e40c8373e3480e4f29f2692bd21b3e05f296d3afebc7e5dcf21b9756ca1c/regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff", size = 262122, upload_time = "2024-11-06T20:11:13.161Z" }, + { url = "https://files.pythonhosted.org/packages/45/94/bc295babb3062a731f52621cdc992d123111282e291abaf23faa413443ea/regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a", size = 273545, upload_time = "2024-11-06T20:11:15Z" }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218, upload_time = "2024-05-29T15:37:49.536Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928, upload_time = "2024-05-29T15:37:47.027Z" }, +] + +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "oauthlib" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload_time = "2024-03-22T20:32:29.939Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload_time = "2024-03-22T20:32:28.055Z" }, +] + +[[package]] +name = "rich" +version = "14.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/53/830aa4c3066a8ab0ae9a9955976fb770fe9c6102117c8ec4ab3ea62d89e8/rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725", size = 224078, upload_time = "2025-03-30T14:15:14.23Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229, upload_time = "2025-03-30T14:15:12.283Z" }, +] + +[[package]] +name = "rich-toolkit" +version = "0.14.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/24/f0678256fbe0643b4ba00a460f4b736ef07042e459f8d4087c8b7011ab81/rich_toolkit-0.14.5.tar.gz", hash = "sha256:1cb7a3fa0bdbf35793460708664f3f797e8b18cedec9cd41a7e6125e4bc6272b", size = 104799, upload_time = "2025-05-05T10:19:24.521Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d1/13/621cc551b72de51e6e5cb7cfc510a141e1858bd380ee3c8108fbda4a6be0/rich_toolkit-0.14.5-py3-none-any.whl", hash = "sha256:2fe9846ecbf5d0cdf236c7f43452b68d9da1436a81594aba6b79b3c48b05703b", size = 24791, upload_time = "2025-05-05T10:19:23.346Z" }, +] + +[[package]] +name = "rsa" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload_time = "2025-04-16T09:51:18.218Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload_time = "2025-04-16T09:51:17.142Z" }, +] + +[[package]] +name = "safetensors" +version = "0.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/71/7e/2d5d6ee7b40c0682315367ec7475693d110f512922d582fef1bd4a63adc3/safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965", size = 67210, upload_time = "2025-02-26T09:15:13.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/ae/88f6c49dbd0cc4da0e08610019a3c78a7d390879a919411a410a1876d03a/safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073", size = 436917, upload_time = "2025-02-26T09:15:03.702Z" }, + { url = "https://files.pythonhosted.org/packages/b8/3b/11f1b4a2f5d2ab7da34ecc062b0bc301f2be024d110a6466726bec8c055c/safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7", size = 418419, upload_time = "2025-02-26T09:15:01.765Z" }, + { url = "https://files.pythonhosted.org/packages/5d/9a/add3e6fef267658075c5a41573c26d42d80c935cdc992384dfae435feaef/safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467", size = 459493, upload_time = "2025-02-26T09:14:51.812Z" }, + { url = "https://files.pythonhosted.org/packages/df/5c/bf2cae92222513cc23b3ff85c4a1bb2811a2c3583ac0f8e8d502751de934/safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e", size = 472400, upload_time = "2025-02-26T09:14:53.549Z" }, + { url = "https://files.pythonhosted.org/packages/58/11/7456afb740bd45782d0f4c8e8e1bb9e572f1bf82899fb6ace58af47b4282/safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d", size = 522891, upload_time = "2025-02-26T09:14:55.717Z" }, + { url = "https://files.pythonhosted.org/packages/57/3d/fe73a9d2ace487e7285f6e157afee2383bd1ddb911b7cb44a55cf812eae3/safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9", size = 537694, upload_time = "2025-02-26T09:14:57.036Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/f8/dae3421624fcc87a89d42e1898a798bc7ff72c61f38973a65d60df8f124c/safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a", size = 471642, upload_time = "2025-02-26T09:15:00.544Z" }, + { url = "https://files.pythonhosted.org/packages/ce/20/1fbe16f9b815f6c5a672f5b760951e20e17e43f67f231428f871909a37f6/safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d", size = 502241, upload_time = "2025-02-26T09:14:58.303Z" }, + { url = "https://files.pythonhosted.org/packages/5f/18/8e108846b506487aa4629fe4116b27db65c3dde922de2c8e0cc1133f3f29/safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b", size = 638001, upload_time = "2025-02-26T09:15:05.79Z" }, + { url = "https://files.pythonhosted.org/packages/82/5a/c116111d8291af6c8c8a8b40628fe833b9db97d8141c2a82359d14d9e078/safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff", size = 734013, upload_time = "2025-02-26T09:15:07.892Z" }, + { url = "https://files.pythonhosted.org/packages/7d/ff/41fcc4d3b7de837963622e8610d998710705bbde9a8a17221d85e5d0baad/safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135", size = 670687, upload_time = "2025-02-26T09:15:09.979Z" }, + { url = "https://files.pythonhosted.org/packages/40/ad/2b113098e69c985a3d8fbda4b902778eae4a35b7d5188859b4a63d30c161/safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04", size = 643147, upload_time = "2025-02-26T09:15:11.185Z" }, + { url = "https://files.pythonhosted.org/packages/0a/0c/95aeb51d4246bd9a3242d3d8349c1112b4ee7611a4b40f0c5c93b05f001d/safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace", size = 296677, upload_time = "2025-02-26T09:15:16.554Z" }, + { url = "https://files.pythonhosted.org/packages/69/e2/b011c38e5394c4c18fb5500778a55ec43ad6106126e74723ffaee246f56e/safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11", size = 308878, upload_time = "2025-02-26T09:15:14.99Z" }, +] + +[[package]] +name = "setuptools" +version = "80.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/70/dc/3976b322de9d2e87ed0007cf04cc7553969b6c7b3f48a565d0333748fbcd/setuptools-80.3.1.tar.gz", hash = "sha256:31e2c58dbb67c99c289f51c16d899afedae292b978f8051efaf6262d8212f927", size = 1315082, upload_time = "2025-05-04T18:47:04.397Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/7e/5d8af3317ddbf9519b687bd1c39d8737fde07d97f54df65553faca5cffb1/setuptools-80.3.1-py3-none-any.whl", hash = "sha256:ea8e00d7992054c4c592aeb892f6ad51fe1b4d90cc6947cc45c45717c40ec537", size = 1201172, upload_time = "2025-05-04T18:47:02.575Z" }, +] + +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = 
"sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload_time = "2023-10-24T04:13:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload_time = "2023-10-24T04:13:38.866Z" }, +] + +[[package]] +name = "simplejson" +version = "3.20.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/92/51b417685abd96b31308b61b9acce7ec50d8e1de8fbc39a7fd4962c60689/simplejson-3.20.1.tar.gz", hash = "sha256:e64139b4ec4f1f24c142ff7dcafe55a22b811a74d86d66560c8815687143037d", size = 85591, upload_time = "2025-02-15T05:18:53.15Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/59/74bc90d1c051bc2432c96b34bd4e8036875ab58b4fcbe4d6a5a76985f853/simplejson-3.20.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:325b8c107253d3217e89d7b50c71015b5b31e2433e6c5bf38967b2f80630a8ca", size = 92132, upload_time = "2025-02-15T05:16:15.743Z" }, + { url = "https://files.pythonhosted.org/packages/71/c7/1970916e0c51794fff89f76da2f632aaf0b259b87753c88a8c409623d3e1/simplejson-3.20.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88a7baa8211089b9e58d78fbc1b0b322103f3f3d459ff16f03a36cece0d0fcf0", size = 74956, upload_time = "2025-02-15T05:16:17.062Z" }, + { url = "https://files.pythonhosted.org/packages/c8/0d/98cc5909180463f1d75fac7180de62d4cdb4e82c4fef276b9e591979372c/simplejson-3.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:299b1007b8101d50d95bc0db1bf5c38dc372e85b504cf77f596462083ee77e3f", size = 74772, upload_time = "2025-02-15T05:16:19.204Z" }, + { url = "https://files.pythonhosted.org/packages/e1/94/a30a5211a90d67725a3e8fcc1c788189f2ae2ed2b96b63ed15d0b7f5d6bb/simplejson-3.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ec618ed65caab48e81e3ed29586236a8e57daef792f1f3bb59504a7e98cd10", size = 143575, upload_time = "2025-02-15T05:16:21.337Z" }, + { url = "https://files.pythonhosted.org/packages/ee/08/cdb6821f1058eb5db46d252de69ff7e6c53f05f1bae6368fe20d5b51d37e/simplejson-3.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2cdead1d3197f0ff43373cf4730213420523ba48697743e135e26f3d179f38", size = 153241, upload_time = "2025-02-15T05:16:22.859Z" }, + { url = "https://files.pythonhosted.org/packages/4c/2d/ca3caeea0bdc5efc5503d5f57a2dfb56804898fb196dfada121323ee0ccb/simplejson-3.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3466d2839fdc83e1af42e07b90bc8ff361c4e8796cd66722a40ba14e458faddd", size = 141500, upload_time = "2025-02-15T05:16:25.068Z" }, + { url = "https://files.pythonhosted.org/packages/e1/33/d3e0779d5c58245e7370c98eb969275af6b7a4a5aec3b97cbf85f09ad328/simplejson-3.20.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d492ed8e92f3a9f9be829205f44b1d0a89af6582f0cf43e0d129fa477b93fe0c", size = 144757, upload_time = "2025-02-15T05:16:28.301Z" }, + { url = "https://files.pythonhosted.org/packages/54/53/2d93128bb55861b2fa36c5944f38da51a0bc6d83e513afc6f7838440dd15/simplejson-3.20.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f924b485537b640dc69434565463fd6fc0c68c65a8c6e01a823dd26c9983cf79", size = 144409, upload_time = "2025-02-15T05:16:29.687Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/4c/dac310a98f897ad3435b4bdc836d92e78f09e38c5dbf28211ed21dc59fa2/simplejson-3.20.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9e8eacf6a3491bf76ea91a8d46726368a6be0eb94993f60b8583550baae9439e", size = 146082, upload_time = "2025-02-15T05:16:31.064Z" }, + { url = "https://files.pythonhosted.org/packages/ee/22/d7ba958cfed39827335b82656b1c46f89678faecda9a7677b47e87b48ee6/simplejson-3.20.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d34d04bf90b4cea7c22d8b19091633908f14a096caa301b24c2f3d85b5068fb8", size = 154339, upload_time = "2025-02-15T05:16:32.719Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c8/b072b741129406a7086a0799c6f5d13096231bf35fdd87a0cffa789687fc/simplejson-3.20.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:69dd28d4ce38390ea4aaf212902712c0fd1093dc4c1ff67e09687c3c3e15a749", size = 147915, upload_time = "2025-02-15T05:16:34.291Z" }, + { url = "https://files.pythonhosted.org/packages/6c/46/8347e61e9cf3db5342a42f7fd30a81b4f5cf85977f916852d7674a540907/simplejson-3.20.1-cp311-cp311-win32.whl", hash = "sha256:dfe7a9da5fd2a3499436cd350f31539e0a6ded5da6b5b3d422df016444d65e43", size = 73972, upload_time = "2025-02-15T05:16:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/01/85/b52f24859237b4e9d523d5655796d911ba3d46e242eb1959c45b6af5aedd/simplejson-3.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:896a6c04d7861d507d800da7642479c3547060bf97419d9ef73d98ced8258766", size = 75595, upload_time = "2025-02-15T05:16:36.957Z" }, + { url = "https://files.pythonhosted.org/packages/8d/eb/34c16a1ac9ba265d024dc977ad84e1659d931c0a700967c3e59a98ed7514/simplejson-3.20.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f31c4a3a7ab18467ee73a27f3e59158255d1520f3aad74315edde7a940f1be23", size = 93100, upload_time = "2025-02-15T05:16:38.801Z" }, + { url = "https://files.pythonhosted.org/packages/41/fc/2c2c007d135894971e6814e7c0806936e5bade28f8db4dd7e2a58b50debd/simplejson-3.20.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:884e6183d16b725e113b83a6fc0230152ab6627d4d36cb05c89c2c5bccfa7bc6", size = 75464, upload_time = "2025-02-15T05:16:40.905Z" }, + { url = "https://files.pythonhosted.org/packages/0f/05/2b5ecb33b776c34bb5cace5de5d7669f9b60e3ca13c113037b2ca86edfbd/simplejson-3.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03d7a426e416fe0d3337115f04164cd9427eb4256e843a6b8751cacf70abc832", size = 75112, upload_time = "2025-02-15T05:16:42.246Z" }, + { url = "https://files.pythonhosted.org/packages/fe/36/1f3609a2792f06cd4b71030485f78e91eb09cfd57bebf3116bf2980a8bac/simplejson-3.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:000602141d0bddfcff60ea6a6e97d5e10c9db6b17fd2d6c66199fa481b6214bb", size = 150182, upload_time = "2025-02-15T05:16:43.557Z" }, + { url = "https://files.pythonhosted.org/packages/2f/b0/053fbda38b8b602a77a4f7829def1b4f316cd8deb5440a6d3ee90790d2a4/simplejson-3.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:af8377a8af78226e82e3a4349efdde59ffa421ae88be67e18cef915e4023a595", size = 158363, upload_time = "2025-02-15T05:16:45.748Z" }, + { url = "https://files.pythonhosted.org/packages/d1/4b/2eb84ae867539a80822e92f9be4a7200dffba609275faf99b24141839110/simplejson-3.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15c7de4c88ab2fbcb8781a3b982ef883696736134e20b1210bca43fb42ff1acf", size = 148415, upload_time = "2025-02-15T05:16:47.861Z" }, + { url = 
"https://files.pythonhosted.org/packages/e0/bd/400b0bd372a5666addf2540c7358bfc3841b9ce5cdbc5cc4ad2f61627ad8/simplejson-3.20.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:455a882ff3f97d810709f7b620007d4e0aca8da71d06fc5c18ba11daf1c4df49", size = 152213, upload_time = "2025-02-15T05:16:49.25Z" }, + { url = "https://files.pythonhosted.org/packages/50/12/143f447bf6a827ee9472693768dc1a5eb96154f8feb140a88ce6973a3cfa/simplejson-3.20.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fc0f523ce923e7f38eb67804bc80e0a028c76d7868500aa3f59225574b5d0453", size = 150048, upload_time = "2025-02-15T05:16:51.5Z" }, + { url = "https://files.pythonhosted.org/packages/5e/ea/dd9b3e8e8ed710a66f24a22c16a907c9b539b6f5f45fd8586bd5c231444e/simplejson-3.20.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76461ec929282dde4a08061071a47281ad939d0202dc4e63cdd135844e162fbc", size = 151668, upload_time = "2025-02-15T05:16:53Z" }, + { url = "https://files.pythonhosted.org/packages/99/af/ee52a8045426a0c5b89d755a5a70cc821815ef3c333b56fbcad33c4435c0/simplejson-3.20.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ab19c2da8c043607bde4d4ef3a6b633e668a7d2e3d56f40a476a74c5ea71949f", size = 158840, upload_time = "2025-02-15T05:16:54.851Z" }, + { url = "https://files.pythonhosted.org/packages/68/db/ab32869acea6b5de7d75fa0dac07a112ded795d41eaa7e66c7813b17be95/simplejson-3.20.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b2578bedaedf6294415197b267d4ef678fea336dd78ee2a6d2f4b028e9d07be3", size = 154212, upload_time = "2025-02-15T05:16:56.318Z" }, + { url = "https://files.pythonhosted.org/packages/fa/7a/e3132d454977d75a3bf9a6d541d730f76462ebf42a96fea2621498166f41/simplejson-3.20.1-cp312-cp312-win32.whl", hash = "sha256:339f407373325a36b7fd744b688ba5bae0666b5d340ec6d98aebc3014bf3d8ea", size = 74101, upload_time = "2025-02-15T05:16:57.746Z" }, + { url = "https://files.pythonhosted.org/packages/bc/5d/4e243e937fa3560107c69f6f7c2eed8589163f5ed14324e864871daa2dd9/simplejson-3.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:627d4486a1ea7edf1f66bb044ace1ce6b4c1698acd1b05353c97ba4864ea2e17", size = 75736, upload_time = "2025-02-15T05:16:59.017Z" }, + { url = "https://files.pythonhosted.org/packages/c4/03/0f453a27877cb5a5fff16a975925f4119102cc8552f52536b9a98ef0431e/simplejson-3.20.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:71e849e7ceb2178344998cbe5ade101f1b329460243c79c27fbfc51c0447a7c3", size = 93109, upload_time = "2025-02-15T05:17:00.377Z" }, + { url = "https://files.pythonhosted.org/packages/74/1f/a729f4026850cabeaff23e134646c3f455e86925d2533463420635ae54de/simplejson-3.20.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b63fdbab29dc3868d6f009a59797cefaba315fd43cd32ddd998ee1da28e50e29", size = 75475, upload_time = "2025-02-15T05:17:02.544Z" }, + { url = "https://files.pythonhosted.org/packages/e2/14/50a2713fee8ff1f8d655b1a14f4a0f1c0c7246768a1b3b3d12964a4ed5aa/simplejson-3.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1190f9a3ce644fd50ec277ac4a98c0517f532cfebdcc4bd975c0979a9f05e1fb", size = 75112, upload_time = "2025-02-15T05:17:03.875Z" }, + { url = "https://files.pythonhosted.org/packages/45/86/ea9835abb646755140e2d482edc9bc1e91997ed19a59fd77ae4c6a0facea/simplejson-3.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1336ba7bcb722ad487cd265701ff0583c0bb6de638364ca947bb84ecc0015d1", size = 150245, upload_time = "2025-02-15T05:17:06.899Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/b4/53084809faede45da829fe571c65fbda8479d2a5b9c633f46b74124d56f5/simplejson-3.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e975aac6a5acd8b510eba58d5591e10a03e3d16c1cf8a8624ca177491f7230f0", size = 158465, upload_time = "2025-02-15T05:17:08.707Z" }, + { url = "https://files.pythonhosted.org/packages/a9/7d/d56579468d1660b3841e1f21c14490d103e33cf911886b22652d6e9683ec/simplejson-3.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a6dd11ee282937ad749da6f3b8d87952ad585b26e5edfa10da3ae2536c73078", size = 148514, upload_time = "2025-02-15T05:17:11.323Z" }, + { url = "https://files.pythonhosted.org/packages/19/e3/874b1cca3d3897b486d3afdccc475eb3a09815bf1015b01cf7fcb52a55f0/simplejson-3.20.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab980fcc446ab87ea0879edad41a5c28f2d86020014eb035cf5161e8de4474c6", size = 152262, upload_time = "2025-02-15T05:17:13.543Z" }, + { url = "https://files.pythonhosted.org/packages/32/84/f0fdb3625292d945c2bd13a814584603aebdb38cfbe5fe9be6b46fe598c4/simplejson-3.20.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f5aee2a4cb6b146bd17333ac623610f069f34e8f31d2f4f0c1a2186e50c594f0", size = 150164, upload_time = "2025-02-15T05:17:15.021Z" }, + { url = "https://files.pythonhosted.org/packages/95/51/6d625247224f01eaaeabace9aec75ac5603a42f8ebcce02c486fbda8b428/simplejson-3.20.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:652d8eecbb9a3b6461b21ec7cf11fd0acbab144e45e600c817ecf18e4580b99e", size = 151795, upload_time = "2025-02-15T05:17:16.542Z" }, + { url = "https://files.pythonhosted.org/packages/7f/d9/bb921df6b35be8412f519e58e86d1060fddf3ad401b783e4862e0a74c4c1/simplejson-3.20.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:8c09948f1a486a89251ee3a67c9f8c969b379f6ffff1a6064b41fea3bce0a112", size = 159027, upload_time = "2025-02-15T05:17:18.083Z" }, + { url = "https://files.pythonhosted.org/packages/03/c5/5950605e4ad023a6621cf4c931b29fd3d2a9c1f36be937230bfc83d7271d/simplejson-3.20.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cbbd7b215ad4fc6f058b5dd4c26ee5c59f72e031dfda3ac183d7968a99e4ca3a", size = 154380, upload_time = "2025-02-15T05:17:20.334Z" }, + { url = "https://files.pythonhosted.org/packages/66/ad/b74149557c5ec1e4e4d55758bda426f5d2ec0123cd01a53ae63b8de51fa3/simplejson-3.20.1-cp313-cp313-win32.whl", hash = "sha256:ae81e482476eaa088ef9d0120ae5345de924f23962c0c1e20abbdff597631f87", size = 74102, upload_time = "2025-02-15T05:17:22.475Z" }, + { url = "https://files.pythonhosted.org/packages/db/a9/25282fdd24493e1022f30b7f5cdf804255c007218b2bfaa655bd7ad34b2d/simplejson-3.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:1b9fd15853b90aec3b1739f4471efbf1ac05066a2c7041bf8db821bb73cd2ddc", size = 75736, upload_time = "2025-02-15T05:17:24.122Z" }, + { url = "https://files.pythonhosted.org/packages/4b/30/00f02a0a921556dd5a6db1ef2926a1bc7a8bbbfb1c49cfed68a275b8ab2b/simplejson-3.20.1-py3-none-any.whl", hash = "sha256:8a6c1bbac39fa4a79f83cbf1df6ccd8ff7069582a9fd8db1e52cea073bc2c697", size = 57121, upload_time = "2025-02-15T05:18:51.243Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = 
"sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload_time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload_time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload_time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload_time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/f4/4a80cd6ef364b2e8b65b15816a843c0980f7a5a2b4dc701fc574952aa19f/soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a", size = 103418, upload_time = "2025-04-20T18:50:08.518Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload_time = "2025-04-20T18:50:07.196Z" }, +] + +[[package]] +name = "sqlalchemy" +version = "2.0.40" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet", marker = "(python_full_version < '3.14' and platform_machine == 'AMD64') or (python_full_version < '3.14' and platform_machine == 'WIN32') or (python_full_version < '3.14' and platform_machine == 'aarch64') or (python_full_version < '3.14' and platform_machine == 'amd64') or (python_full_version < '3.14' and platform_machine == 'ppc64le') or (python_full_version < '3.14' and platform_machine == 'win32') or (python_full_version < '3.14' and platform_machine == 'x86_64')" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/c3/3f2bfa5e4dcd9938405fe2fab5b6ab94a9248a4f9536ea2fd497da20525f/sqlalchemy-2.0.40.tar.gz", hash = "sha256:d827099289c64589418ebbcaead0145cd19f4e3e8a93919a0100247af245fa00", size = 9664299, upload_time = "2025-03-27T17:52:31.876Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/7e/55044a9ec48c3249bb38d5faae93f09579c35e862bb318ebd1ed7a1994a5/sqlalchemy-2.0.40-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f6bacab7514de6146a1976bc56e1545bee247242fab030b89e5f70336fc0003e", size = 2114025, upload_time = "2025-03-27T18:49:29.456Z" }, + { url = "https://files.pythonhosted.org/packages/77/0f/dcf7bba95f847aec72f638750747b12d37914f71c8cc7c133cf326ab945c/sqlalchemy-2.0.40-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5654d1ac34e922b6c5711631f2da497d3a7bffd6f9f87ac23b35feea56098011", size = 2104419, upload_time = "2025-03-27T18:49:30.75Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/70/c86a5c20715e4fe903dde4c2fd44fc7e7a0d5fb52c1b954d98526f65a3ea/sqlalchemy-2.0.40-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35904d63412db21088739510216e9349e335f142ce4a04b69e2528020ee19ed4", size = 3222720, upload_time = "2025-03-27T18:44:29.871Z" }, + { url = "https://files.pythonhosted.org/packages/12/cf/b891a8c1d0c27ce9163361664c2128c7a57de3f35000ea5202eb3a2917b7/sqlalchemy-2.0.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c7a80ed86d6aaacb8160a1caef6680d4ddd03c944d985aecee940d168c411d1", size = 3222682, upload_time = "2025-03-27T18:55:20.097Z" }, + { url = "https://files.pythonhosted.org/packages/15/3f/7709d8c8266953d945435a96b7f425ae4172a336963756b58e996fbef7f3/sqlalchemy-2.0.40-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:519624685a51525ddaa7d8ba8265a1540442a2ec71476f0e75241eb8263d6f51", size = 3159542, upload_time = "2025-03-27T18:44:31.333Z" }, + { url = "https://files.pythonhosted.org/packages/85/7e/717eaabaf0f80a0132dc2032ea8f745b7a0914451c984821a7c8737fb75a/sqlalchemy-2.0.40-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2ee5f9999a5b0e9689bed96e60ee53c3384f1a05c2dd8068cc2e8361b0df5b7a", size = 3179864, upload_time = "2025-03-27T18:55:21.784Z" }, + { url = "https://files.pythonhosted.org/packages/e4/cc/03eb5dfcdb575cbecd2bd82487b9848f250a4b6ecfb4707e834b4ce4ec07/sqlalchemy-2.0.40-cp311-cp311-win32.whl", hash = "sha256:c0cae71e20e3c02c52f6b9e9722bca70e4a90a466d59477822739dc31ac18b4b", size = 2084675, upload_time = "2025-03-27T18:48:55.915Z" }, + { url = "https://files.pythonhosted.org/packages/9a/48/440946bf9dc4dc231f4f31ef0d316f7135bf41d4b86aaba0c0655150d370/sqlalchemy-2.0.40-cp311-cp311-win_amd64.whl", hash = "sha256:574aea2c54d8f1dd1699449f332c7d9b71c339e04ae50163a3eb5ce4c4325ee4", size = 2110099, upload_time = "2025-03-27T18:48:57.45Z" }, + { url = "https://files.pythonhosted.org/packages/92/06/552c1f92e880b57d8b92ce6619bd569b25cead492389b1d84904b55989d8/sqlalchemy-2.0.40-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9d3b31d0a1c44b74d3ae27a3de422dfccd2b8f0b75e51ecb2faa2bf65ab1ba0d", size = 2112620, upload_time = "2025-03-27T18:40:00.071Z" }, + { url = "https://files.pythonhosted.org/packages/01/72/a5bc6e76c34cebc071f758161dbe1453de8815ae6e662393910d3be6d70d/sqlalchemy-2.0.40-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:37f7a0f506cf78c80450ed1e816978643d3969f99c4ac6b01104a6fe95c5490a", size = 2103004, upload_time = "2025-03-27T18:40:04.204Z" }, + { url = "https://files.pythonhosted.org/packages/bf/fd/0e96c8e6767618ed1a06e4d7a167fe13734c2f8113c4cb704443e6783038/sqlalchemy-2.0.40-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bb933a650323e476a2e4fbef8997a10d0003d4da996aad3fd7873e962fdde4d", size = 3252440, upload_time = "2025-03-27T18:51:25.624Z" }, + { url = "https://files.pythonhosted.org/packages/cd/6a/eb82e45b15a64266a2917a6833b51a334ea3c1991728fd905bfccbf5cf63/sqlalchemy-2.0.40-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6959738971b4745eea16f818a2cd086fb35081383b078272c35ece2b07012716", size = 3263277, upload_time = "2025-03-27T18:50:28.142Z" }, + { url = "https://files.pythonhosted.org/packages/45/97/ebe41ab4530f50af99e3995ebd4e0204bf1b0dc0930f32250dde19c389fe/sqlalchemy-2.0.40-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:110179728e442dae85dd39591beb74072ae4ad55a44eda2acc6ec98ead80d5f2", size = 3198591, upload_time = "2025-03-27T18:51:27.543Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/1c/a569c1b2b2f5ac20ba6846a1321a2bf52e9a4061001f282bf1c5528dcd69/sqlalchemy-2.0.40-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8040680eaacdce4d635f12c55c714f3d4c7f57da2bc47a01229d115bd319191", size = 3225199, upload_time = "2025-03-27T18:50:30.069Z" }, + { url = "https://files.pythonhosted.org/packages/8f/91/87cc71a6b10065ca0209d19a4bb575378abda6085e72fa0b61ffb2201b84/sqlalchemy-2.0.40-cp312-cp312-win32.whl", hash = "sha256:650490653b110905c10adac69408380688cefc1f536a137d0d69aca1069dc1d1", size = 2082959, upload_time = "2025-03-27T18:45:57.574Z" }, + { url = "https://files.pythonhosted.org/packages/2a/9f/14c511cda174aa1ad9b0e42b64ff5a71db35d08b0d80dc044dae958921e5/sqlalchemy-2.0.40-cp312-cp312-win_amd64.whl", hash = "sha256:2be94d75ee06548d2fc591a3513422b873490efb124048f50556369a834853b0", size = 2108526, upload_time = "2025-03-27T18:45:58.965Z" }, + { url = "https://files.pythonhosted.org/packages/8c/18/4e3a86cc0232377bc48c373a9ba6a1b3fb79ba32dbb4eda0b357f5a2c59d/sqlalchemy-2.0.40-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:915866fd50dd868fdcc18d61d8258db1bf9ed7fbd6dfec960ba43365952f3b01", size = 2107887, upload_time = "2025-03-27T18:40:05.461Z" }, + { url = "https://files.pythonhosted.org/packages/cb/60/9fa692b1d2ffc4cbd5f47753731fd332afed30137115d862d6e9a1e962c7/sqlalchemy-2.0.40-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a4c5a2905a9ccdc67a8963e24abd2f7afcd4348829412483695c59e0af9a705", size = 2098367, upload_time = "2025-03-27T18:40:07.182Z" }, + { url = "https://files.pythonhosted.org/packages/4c/9f/84b78357ca641714a439eb3fbbddb17297dacfa05d951dbf24f28d7b5c08/sqlalchemy-2.0.40-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55028d7a3ebdf7ace492fab9895cbc5270153f75442a0472d8516e03159ab364", size = 3184806, upload_time = "2025-03-27T18:51:29.356Z" }, + { url = "https://files.pythonhosted.org/packages/4b/7d/e06164161b6bfce04c01bfa01518a20cccbd4100d5c951e5a7422189191a/sqlalchemy-2.0.40-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6cfedff6878b0e0d1d0a50666a817ecd85051d12d56b43d9d425455e608b5ba0", size = 3198131, upload_time = "2025-03-27T18:50:31.616Z" }, + { url = "https://files.pythonhosted.org/packages/6d/51/354af20da42d7ec7b5c9de99edafbb7663a1d75686d1999ceb2c15811302/sqlalchemy-2.0.40-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bb19e30fdae77d357ce92192a3504579abe48a66877f476880238a962e5b96db", size = 3131364, upload_time = "2025-03-27T18:51:31.336Z" }, + { url = "https://files.pythonhosted.org/packages/7a/2f/48a41ff4e6e10549d83fcc551ab85c268bde7c03cf77afb36303c6594d11/sqlalchemy-2.0.40-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:16d325ea898f74b26ffcd1cf8c593b0beed8714f0317df2bed0d8d1de05a8f26", size = 3159482, upload_time = "2025-03-27T18:50:33.201Z" }, + { url = "https://files.pythonhosted.org/packages/33/ac/e5e0a807163652a35be878c0ad5cfd8b1d29605edcadfb5df3c512cdf9f3/sqlalchemy-2.0.40-cp313-cp313-win32.whl", hash = "sha256:a669cbe5be3c63f75bcbee0b266779706f1a54bcb1000f302685b87d1b8c1500", size = 2080704, upload_time = "2025-03-27T18:46:00.193Z" }, + { url = "https://files.pythonhosted.org/packages/1c/cb/f38c61f7f2fd4d10494c1c135ff6a6ddb63508d0b47bccccd93670637309/sqlalchemy-2.0.40-cp313-cp313-win_amd64.whl", hash = "sha256:641ee2e0834812d657862f3a7de95e0048bdcb6c55496f39c6fa3d435f6ac6ad", size = 2104564, upload_time = "2025-03-27T18:46:01.442Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/7c/5fc8e802e7506fe8b55a03a2e1dab156eae205c91bee46305755e086d2e2/sqlalchemy-2.0.40-py3-none-any.whl", hash = "sha256:32587e2e1e359276957e6fe5dad089758bc042a971a8a09ae8ecf7a8fe23d07a", size = 1903894, upload_time = "2025-03-27T18:40:43.796Z" }, +] + +[[package]] +name = "starlette" +version = "0.45.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/fb/2984a686808b89a6781526129a4b51266f678b2d2b97ab2d325e56116df8/starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f", size = 2574076, upload_time = "2025-01-24T11:17:36.535Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/61/f2b52e107b1fc8944b33ef56bf6ac4ebbe16d91b94d2b87ce013bf63fb84/starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d", size = 71507, upload_time = "2025-01-24T11:17:34.182Z" }, +] + +[[package]] +name = "tensorboard" +version = "2.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "absl-py" }, + { name = "google-auth" }, + { name = "google-auth-oauthlib" }, + { name = "grpcio" }, + { name = "markdown" }, + { name = "numpy" }, + { name = "protobuf" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "six" }, + { name = "tensorboard-data-server" }, + { name = "werkzeug" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/12/f6e9b9dcc310263cbd3948274e286538bd6800fd0c268850788f14a0c6d0/tensorboard-2.15.2-py3-none-any.whl", hash = "sha256:a6f6443728064d962caea6d34653e220e34ef8df764cb06a8212c17e1a8f0622", size = 5539713, upload_time = "2024-02-09T10:39:25.636Z" }, +] + +[[package]] +name = "tensorboard-data-server" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/13/e503968fefabd4c6b2650af21e110aa8466fe21432cd7c43a84577a89438/tensorboard_data_server-0.7.2-py3-none-any.whl", hash = "sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb", size = 2356, upload_time = "2023-10-23T21:23:32.16Z" }, + { url = "https://files.pythonhosted.org/packages/b7/85/dabeaf902892922777492e1d253bb7e1264cadce3cea932f7ff599e53fea/tensorboard_data_server-0.7.2-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60", size = 4823598, upload_time = "2023-10-23T21:23:33.714Z" }, + { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload_time = "2023-10-23T21:23:35.583Z" }, +] + +[[package]] +name = "tensorflow-cpu" +version = "2.15.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "absl-py" }, + { name = "astunparse" }, + { name = "flatbuffers" }, + { name = "gast" }, + { name = "google-pasta" }, + { name = "grpcio" }, + { name = "h5py" }, + { name = "keras" }, + { name = "libclang" }, + { name = "ml-dtypes" }, + { name = "numpy" }, + { name = "opt-einsum" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "setuptools" }, + { name = "six" }, + { name = "tensorboard" }, + { name = "tensorflow-estimator" }, + { name = "tensorflow-io-gcs-filesystem" }, + { name = "termcolor" }, + { 
name = "typing-extensions" }, + { name = "wrapt" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/6c/dc0642ce2656637d8f31ba9c618a41bf14e38428ba77e4e0a9359be39436/tensorflow_cpu-2.15.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:ee3bb114c6031d471d891c761e7eda2c80bea19bb318abcd3d5bab92ccfaf9aa", size = 236482774, upload_time = "2024-03-08T23:52:40.963Z" }, + { url = "https://files.pythonhosted.org/packages/5b/00/af89cb211fc96ffdebb52a687dad7f83b0b1d82bc057f65309fa03a89911/tensorflow_cpu-2.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54660c074d7241a503e81edfd9f5ef5af88f64051b72e2945f26318c790f2d26", size = 207208420, upload_time = "2024-03-08T23:48:30.479Z" }, + { url = "https://files.pythonhosted.org/packages/51/8a/ff2fc9bad8edc68ef4cd63963c10b320de03d3496def83d2a9b1635c6831/tensorflow_cpu-2.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:dc75baf4c08a6e8ab7ceec97f002bb993508a5b58f13fac5283ee976a71a3c67", size = 2133, upload_time = "2024-03-08T23:53:47.249Z" }, +] + +[[package]] +name = "tensorflow-estimator" +version = "2.15.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/c8/2f823c8958d5342eafc6dd3e922f0cc4fcf8c2e0460284cc462dae3b60a0/tensorflow_estimator-2.15.0-py2.py3-none-any.whl", hash = "sha256:aedf21eec7fb2dc91150fc91a1ce12bc44dbb72278a08b58e79ff87c9e28f153", size = 441974, upload_time = "2023-11-07T01:10:10.812Z" }, +] + +[[package]] +name = "tensorflow-io-gcs-filesystem" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/00/900ca310ff2e46eb3127f8f54af0b0000a6cc786be6a54dc2cfe841f4683/tensorflow_io_gcs_filesystem-0.31.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:8909c4344b0e96aa356230ab460ffafe5900c33c1aaced65fafae71d177a1966", size = 1642401, upload_time = "2023-02-25T19:31:40.204Z" }, + { url = "https://files.pythonhosted.org/packages/e7/c4/0d44ef93add3432ce43f37fe0c205cc7b6fd685fca80054fb4a646a9dbe3/tensorflow_io_gcs_filesystem-0.31.0-cp311-cp311-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e417faf8755aafe52d8f8c6b5ae5bae6e4fae8326ee3acd5e9181b83bbfbae87", size = 2381673, upload_time = "2023-02-25T19:31:41.992Z" }, + { url = "https://files.pythonhosted.org/packages/a8/2b/3064195efa016fff942009fe965ecbbbbd7d70bf34ee22d4ff31a0f3443a/tensorflow_io_gcs_filesystem-0.31.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c40e3c4ee1f8dda3b545deea6b8839192c82037d8021db9f589908034ad975", size = 2572150, upload_time = "2023-02-25T19:31:43.874Z" }, + { url = "https://files.pythonhosted.org/packages/ac/4e/9566a313927be582ca99455a9523a097c7888fc819695bdc08415432b202/tensorflow_io_gcs_filesystem-0.31.0-cp311-cp311-win_amd64.whl", hash = "sha256:4bb37d23f21c434687b11059cb7ffd094d52a7813368915ba1b7057e3c16e414", size = 1486315, upload_time = "2023-02-25T19:31:45.641Z" }, +] + +[[package]] +name = "termcolor" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/6c/3d75c196ac07ac8749600b60b03f4f6094d54e132c4d94ebac6ee0e0add0/termcolor-3.1.0.tar.gz", hash = "sha256:6a6dd7fbee581909eeec6a756cff1d7f7c376063b14e4a298dc4980309e55970", size = 14324, upload_time = "2025-04-30T11:37:53.791Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4f/bd/de8d508070629b6d84a30d01d57e4a65c69aa7f5abe7560b8fad3b50ea59/termcolor-3.1.0-py3-none-any.whl", hash = "sha256:591dd26b5c2ce03b9e43f391264626557873ce1d379019786f99b0c2bee140aa", size = 7684, upload_time = "2025-04-30T11:37:52.382Z" }, +] + +[[package]] +name = "text-unidecode" +version = "1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ab/e2/e9a00f0ccb71718418230718b3d900e71a5d16e701a3dae079a21e9cd8f8/text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93", size = 76885, upload_time = "2019-08-30T21:36:45.405Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154, upload_time = "2019-08-30T21:37:03.543Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.19.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/48/04/2071c150f374aab6d5e92aaec38d0f3c368d227dd9e0469a1f0966ac68d1/tokenizers-0.19.1.tar.gz", hash = "sha256:ee59e6680ed0fdbe6b724cf38bd70400a0c1dd623b07ac729087270caeac88e3", size = 321039, upload_time = "2024-04-17T21:40:41.849Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/d6/6e1d728d765eb4102767f071bf7f6439ab10d7f4a975c9217db65715207a/tokenizers-0.19.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:5c88d1481f1882c2e53e6bb06491e474e420d9ac7bdff172610c4f9ad3898059", size = 2533448, upload_time = "2024-04-17T21:36:38.61Z" }, + { url = "https://files.pythonhosted.org/packages/90/79/d17a0f491d10817cd30f1121a07aa09c8e97a81114b116e473baf1577f09/tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ddf672ed719b4ed82b51499100f5417d7d9f6fb05a65e232249268f35de5ed14", size = 2440254, upload_time = "2024-04-17T21:36:40.398Z" }, + { url = "https://files.pythonhosted.org/packages/c7/28/2d11c3ff94f9d42eceb2ea549a06e3f166fe391c5a025e5d96fac898a3ac/tokenizers-0.19.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:dadc509cc8a9fe460bd274c0e16ac4184d0958117cf026e0ea8b32b438171594", size = 3684971, upload_time = "2024-04-17T21:36:43.115Z" }, + { url = "https://files.pythonhosted.org/packages/36/c6/537f22b57e6003904d35d07962dbde2f2e9bdd791d0241da976a4c7f8194/tokenizers-0.19.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfedf31824ca4915b511b03441784ff640378191918264268e6923da48104acc", size = 3568894, upload_time = "2024-04-17T21:36:45.011Z" }, + { url = "https://files.pythonhosted.org/packages/af/ef/3c1deed14ec59b2c8e7e2fa27b2a53f7d101181277a43b89ab17d891ef2e/tokenizers-0.19.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac11016d0a04aa6487b1513a3a36e7bee7eec0e5d30057c9c0408067345c48d2", size = 3426873, upload_time = "2024-04-17T21:36:47.001Z" }, + { url = "https://files.pythonhosted.org/packages/06/db/c0320c4798ac6bd12d2ef895bec9d10d216a3b4d6fff10e9d68883ea7edc/tokenizers-0.19.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76951121890fea8330d3a0df9a954b3f2a37e3ec20e5b0530e9a0044ca2e11fe", size = 3965050, upload_time = "2024-04-17T21:36:49.202Z" }, + { url = 
"https://files.pythonhosted.org/packages/4c/8a/a166888d6cb14db55f5eb7ce0b1d4777d145aa27cbf4f945712cf6c29935/tokenizers-0.19.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b342d2ce8fc8d00f376af068e3274e2e8649562e3bc6ae4a67784ded6b99428d", size = 4047855, upload_time = "2024-04-17T21:36:52.864Z" }, + { url = "https://files.pythonhosted.org/packages/a7/03/fb50fc03f86016b227a967c8d474f90230c885c0d18f78acdfda7a96ce56/tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d16ff18907f4909dca9b076b9c2d899114dd6abceeb074eca0c93e2353f943aa", size = 3608228, upload_time = "2024-04-17T21:36:55.7Z" }, + { url = "https://files.pythonhosted.org/packages/5b/cd/0385e1026e1e03732fd398e964792a3a8433918b166748c82507e014d748/tokenizers-0.19.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:706a37cc5332f85f26efbe2bdc9ef8a9b372b77e4645331a405073e4b3a8c1c6", size = 9633115, upload_time = "2024-04-17T21:36:58.299Z" }, + { url = "https://files.pythonhosted.org/packages/25/50/8f8ad0bbdaf09d04b15e6502d1fa1c653754ed7e016e4ae009726aa1a4e4/tokenizers-0.19.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:16baac68651701364b0289979ecec728546133e8e8fe38f66fe48ad07996b88b", size = 9949062, upload_time = "2024-04-17T21:37:01.947Z" }, + { url = "https://files.pythonhosted.org/packages/db/11/31be66710f1d14526f3588a441efadeb184e1e68458067007b20ead03c59/tokenizers-0.19.1-cp311-none-win32.whl", hash = "sha256:9ed240c56b4403e22b9584ee37d87b8bfa14865134e3e1c3fb4b2c42fafd3256", size = 2041039, upload_time = "2024-04-17T21:37:05.607Z" }, + { url = "https://files.pythonhosted.org/packages/65/8e/6d7d72b28f22c422cff8beae10ac3c2e4376b9be721ef8167b7eecd1da62/tokenizers-0.19.1-cp311-none-win_amd64.whl", hash = "sha256:ad57d59341710b94a7d9dbea13f5c1e7d76fd8d9bcd944a7a6ab0b0da6e0cc66", size = 2220386, upload_time = "2024-04-17T21:37:08.295Z" }, + { url = "https://files.pythonhosted.org/packages/63/90/2890cd096898dcdb596ee172cde40c0f54a9cf43b0736aa260a5501252af/tokenizers-0.19.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:621d670e1b1c281a1c9698ed89451395d318802ff88d1fc1accff0867a06f153", size = 2530580, upload_time = "2024-04-17T21:37:10.688Z" }, + { url = "https://files.pythonhosted.org/packages/74/d1/f4e1e950adb36675dfd8f9d0f4be644f3f3aaf22a5677a4f5c81282b662e/tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d924204a3dbe50b75630bd16f821ebda6a5f729928df30f582fb5aade90c818a", size = 2436682, upload_time = "2024-04-17T21:37:12.966Z" }, + { url = "https://files.pythonhosted.org/packages/ed/30/89b321a16c58d233e301ec15072c0d3ed5014825e72da98604cd3ab2fba1/tokenizers-0.19.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4f3fefdc0446b1a1e6d81cd4c07088ac015665d2e812f6dbba4a06267d1a2c95", size = 3693494, upload_time = "2024-04-17T21:37:14.755Z" }, + { url = "https://files.pythonhosted.org/packages/05/40/fa899f32de483500fbc78befd378fd7afba4270f17db707d1a78c0a4ddc3/tokenizers-0.19.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9620b78e0b2d52ef07b0d428323fb34e8ea1219c5eac98c2596311f20f1f9266", size = 3566541, upload_time = "2024-04-17T21:37:17.067Z" }, + { url = "https://files.pythonhosted.org/packages/67/14/e7da32ae5fb4971830f1ef335932fae3fa57e76b537e852f146c850aefdf/tokenizers-0.19.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04ce49e82d100594715ac1b2ce87d1a36e61891a91de774755f743babcd0dd52", size = 3430792, upload_time = "2024-04-17T21:37:19.055Z" }, + { 
url = "https://files.pythonhosted.org/packages/f2/4b/aae61bdb6ab584d2612170801703982ee0e35f8b6adacbeefe5a3b277621/tokenizers-0.19.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5c2ff13d157afe413bf7e25789879dd463e5a4abfb529a2d8f8473d8042e28f", size = 3962812, upload_time = "2024-04-17T21:37:21.008Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b6/f7b7ef89c4da7b20256e6eab23d3835f05d1ca8f451d31c16cbfe3cd9eb6/tokenizers-0.19.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3174c76efd9d08f836bfccaca7cfec3f4d1c0a4cf3acbc7236ad577cc423c840", size = 4024688, upload_time = "2024-04-17T21:37:23.659Z" }, + { url = "https://files.pythonhosted.org/packages/80/54/12047a69f5b382d7ee72044dc89151a2dd0d13b2c9bdcc22654883704d31/tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c9d5b6c0e7a1e979bec10ff960fae925e947aab95619a6fdb4c1d8ff3708ce3", size = 3610961, upload_time = "2024-04-17T21:37:26.234Z" }, + { url = "https://files.pythonhosted.org/packages/52/b7/1e8a913d18ac28feeda42d4d2d51781874398fb59cd1c1e2653a4b5742ed/tokenizers-0.19.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a179856d1caee06577220ebcfa332af046d576fb73454b8f4d4b0ba8324423ea", size = 9631367, upload_time = "2024-04-17T21:37:28.752Z" }, + { url = "https://files.pythonhosted.org/packages/ac/3d/2284f6d99f8f21d09352b88b8cfefa24ab88468d962aeb0aa15c20d76b32/tokenizers-0.19.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:952b80dac1a6492170f8c2429bd11fcaa14377e097d12a1dbe0ef2fb2241e16c", size = 9950121, upload_time = "2024-04-17T21:37:31.741Z" }, + { url = "https://files.pythonhosted.org/packages/2a/94/ec3369dbc9b7200c14c8c7a1a04c78b7a7398d0c001e1b7d1ffe30eb93a0/tokenizers-0.19.1-cp312-none-win32.whl", hash = "sha256:01d62812454c188306755c94755465505836fd616f75067abcae529c35edeb57", size = 2044069, upload_time = "2024-04-17T21:37:35.672Z" }, + { url = "https://files.pythonhosted.org/packages/0c/97/80bff6937e0c67d30c0facacd4f0bcf4254e581aa4995c73cef8c8640e56/tokenizers-0.19.1-cp312-none-win_amd64.whl", hash = "sha256:b70bfbe3a82d3e3fb2a5e9b22a39f8d1740c96c68b6ace0086b39074f08ab89a", size = 2214527, upload_time = "2024-04-17T21:37:39.19Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload_time = "2024-11-24T20:12:22.481Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload_time = "2024-11-24T20:12:19.698Z" }, +] + +[[package]] +name = "transformers" +version = "4.40.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/73/ef/d877998c9ab04ecb8eeda495e1c64f2f6bb6724b0634f7d0d6aca2cdc6af/transformers-4.40.2.tar.gz", hash = "sha256:657b6054a2097671398d976ad46e60836e7e15f9ea9551631a96e33cb9240649", size = 7797669, upload_time = "2024-05-06T16:08:02.166Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/23/ba02efa28518557e0cfe0ce5c1170000dd7501ed02ac865fc90cbe3daa93/transformers-4.40.2-py3-none-any.whl", hash = "sha256:71cb94301ec211a2e1d4b8c8d18dcfaa902dfa00a089dceca167a8aa265d6f2d", size = 8999918, upload_time = "2024-05-06T16:07:56.121Z" }, +] + +[[package]] +name = "typer" +version = "0.15.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/1a/5f36851f439884bcfe8539f6a20ff7516e7b60f319bbaf69a90dc35cc2eb/typer-0.15.3.tar.gz", hash = "sha256:818873625d0569653438316567861899f7e9972f2e6e0c16dab608345ced713c", size = 101641, upload_time = "2025-04-28T21:40:59.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/20/9d953de6f4367163d23ec823200eb3ecb0050a2609691e512c8b95827a9b/typer-0.15.3-py3-none-any.whl", hash = "sha256:c86a65ad77ca531f03de08d1b9cb67cd09ad02ddddf4b34745b5008f43b239bd", size = 45253, upload_time = "2025-04-28T21:40:56.269Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967, upload_time = "2025-04-10T14:19:05.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload_time = "2025-04-10T14:19:03.967Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload_time = "2025-03-23T13:54:43.652Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" }, +] + +[[package]] +name = "tzlocal" +version = "5.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload_time = "2025-03-05T21:17:41.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = 
"sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload_time = "2025-03-05T21:17:39.857Z" }, +] + +[[package]] +name = "uritemplate" +version = "4.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d2/5a/4742fdba39cd02a56226815abfa72fe0aa81c33bed16ed045647d6000eba/uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", size = 273898, upload_time = "2021-10-13T11:15:14.84Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c0/7461b49cd25aeece13766f02ee576d1db528f1c37ce69aee300e075b485b/uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e", size = 10356, upload_time = "2021-10-13T11:15:12.316Z" }, +] + +[[package]] +name = "urllib3" +version = "1.26.20" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload_time = "2024-08-29T15:43:11.37Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload_time = "2024-08-29T15:43:08.921Z" }, +] + +[[package]] +name = "uvicorn" +version = "0.34.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/ae/9bbb19b9e1c450cf9ecaef06463e40234d98d95bf572fab11b4f19ae5ded/uvicorn-0.34.2.tar.gz", hash = "sha256:0e929828f6186353a80b58ea719861d2629d766293b6d19baf086ba31d4f3328", size = 76815, upload_time = "2025-04-19T06:02:50.101Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/4b/4cef6ce21a2aaca9d852a6e84ef4f135d99fcd74fa75105e2fc0c8308acd/uvicorn-0.34.2-py3-none-any.whl", hash = "sha256:deb49af569084536d269fe0a6d67e3754f104cf03aba7c11c40f01aadf33c403", size = 62483, upload_time = "2025-04-19T06:02:48.42Z" }, +] + +[package.optional-dependencies] +standard = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "httptools" }, + { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" }, + { name = "watchfiles" }, + { name = "websockets" }, +] + +[[package]] +name = "uvloop" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/c0/854216d09d33c543f12a44b393c402e89a920b1a0a7dc634c42de91b9cf6/uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3", size = 2492741, upload_time = "2024-10-14T23:38:35.489Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/a7/4cf0334105c1160dd6819f3297f8700fda7fc30ab4f61fbf3e725acbc7cc/uvloop-0.21.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8", size = 1447410, upload_time = "2024-10-14T23:37:33.612Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/7c/1517b0bbc2dbe784b563d6ab54f2ef88c890fdad77232c98ed490aa07132/uvloop-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0", size = 805476, upload_time = "2024-10-14T23:37:36.11Z" }, + { url = "https://files.pythonhosted.org/packages/ee/ea/0bfae1aceb82a503f358d8d2fa126ca9dbdb2ba9c7866974faec1cb5875c/uvloop-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e", size = 3960855, upload_time = "2024-10-14T23:37:37.683Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ca/0864176a649838b838f36d44bf31c451597ab363b60dc9e09c9630619d41/uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb", size = 3973185, upload_time = "2024-10-14T23:37:40.226Z" }, + { url = "https://files.pythonhosted.org/packages/30/bf/08ad29979a936d63787ba47a540de2132169f140d54aa25bc8c3df3e67f4/uvloop-0.21.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6", size = 3820256, upload_time = "2024-10-14T23:37:42.839Z" }, + { url = "https://files.pythonhosted.org/packages/da/e2/5cf6ef37e3daf2f06e651aae5ea108ad30df3cb269102678b61ebf1fdf42/uvloop-0.21.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d", size = 3937323, upload_time = "2024-10-14T23:37:45.337Z" }, + { url = "https://files.pythonhosted.org/packages/8c/4c/03f93178830dc7ce8b4cdee1d36770d2f5ebb6f3d37d354e061eefc73545/uvloop-0.21.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c", size = 1471284, upload_time = "2024-10-14T23:37:47.833Z" }, + { url = "https://files.pythonhosted.org/packages/43/3e/92c03f4d05e50f09251bd8b2b2b584a2a7f8fe600008bcc4523337abe676/uvloop-0.21.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2", size = 821349, upload_time = "2024-10-14T23:37:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ef/a02ec5da49909dbbfb1fd205a9a1ac4e88ea92dcae885e7c961847cd51e2/uvloop-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d", size = 4580089, upload_time = "2024-10-14T23:37:51.703Z" }, + { url = "https://files.pythonhosted.org/packages/06/a7/b4e6a19925c900be9f98bec0a75e6e8f79bb53bdeb891916609ab3958967/uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc", size = 4693770, upload_time = "2024-10-14T23:37:54.122Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0c/f07435a18a4b94ce6bd0677d8319cd3de61f3a9eeb1e5f8ab4e8b5edfcb3/uvloop-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb", size = 4451321, upload_time = "2024-10-14T23:37:55.766Z" }, + { url = "https://files.pythonhosted.org/packages/8f/eb/f7032be105877bcf924709c97b1bf3b90255b4ec251f9340cef912559f28/uvloop-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f", size = 4659022, upload_time = "2024-10-14T23:37:58.195Z" }, + { url = 
"https://files.pythonhosted.org/packages/3f/8d/2cbef610ca21539f0f36e2b34da49302029e7c9f09acef0b1c3b5839412b/uvloop-0.21.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281", size = 1468123, upload_time = "2024-10-14T23:38:00.688Z" }, + { url = "https://files.pythonhosted.org/packages/93/0d/b0038d5a469f94ed8f2b2fce2434a18396d8fbfb5da85a0a9781ebbdec14/uvloop-0.21.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af", size = 819325, upload_time = "2024-10-14T23:38:02.309Z" }, + { url = "https://files.pythonhosted.org/packages/50/94/0a687f39e78c4c1e02e3272c6b2ccdb4e0085fda3b8352fecd0410ccf915/uvloop-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6", size = 4582806, upload_time = "2024-10-14T23:38:04.711Z" }, + { url = "https://files.pythonhosted.org/packages/d2/19/f5b78616566ea68edd42aacaf645adbf71fbd83fc52281fba555dc27e3f1/uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816", size = 4701068, upload_time = "2024-10-14T23:38:06.385Z" }, + { url = "https://files.pythonhosted.org/packages/47/57/66f061ee118f413cd22a656de622925097170b9380b30091b78ea0c6ea75/uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc", size = 4454428, upload_time = "2024-10-14T23:38:08.416Z" }, + { url = "https://files.pythonhosted.org/packages/63/9a/0962b05b308494e3202d3f794a6e85abe471fe3cafdbcf95c2e8c713aabd/uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553", size = 4660018, upload_time = "2024-10-14T23:38:10.888Z" }, +] + +[[package]] +name = "watchfiles" +version = "1.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/e2/8ed598c42057de7aa5d97c472254af4906ff0a59a66699d426fc9ef795d7/watchfiles-1.0.5.tar.gz", hash = "sha256:b7529b5dcc114679d43827d8c35a07c493ad6f083633d573d81c660abc5979e9", size = 94537, upload_time = "2025-04-08T10:36:26.722Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/f4/41b591f59021786ef517e1cdc3b510383551846703e03f204827854a96f8/watchfiles-1.0.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:237f9be419e977a0f8f6b2e7b0475ababe78ff1ab06822df95d914a945eac827", size = 405336, upload_time = "2025-04-08T10:34:59.359Z" }, + { url = "https://files.pythonhosted.org/packages/ae/06/93789c135be4d6d0e4f63e96eea56dc54050b243eacc28439a26482b5235/watchfiles-1.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0da39ff917af8b27a4bdc5a97ac577552a38aac0d260a859c1517ea3dc1a7c4", size = 395977, upload_time = "2025-04-08T10:35:00.522Z" }, + { url = "https://files.pythonhosted.org/packages/d2/db/1cd89bd83728ca37054512d4d35ab69b5f12b8aa2ac9be3b0276b3bf06cc/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cfcb3952350e95603f232a7a15f6c5f86c5375e46f0bd4ae70d43e3e063c13d", size = 455232, upload_time = "2025-04-08T10:35:01.698Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/d8a4d44ffe960517e487c9c04f77b06b8abf05eb680bed71c82b5f2cad62/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:68b2dddba7a4e6151384e252a5632efcaa9bc5d1c4b567f3cb621306b2ca9f63", size = 459151, upload_time = "2025-04-08T10:35:03.358Z" }, + { url = "https://files.pythonhosted.org/packages/6c/da/267a1546f26465dead1719caaba3ce660657f83c9d9c052ba98fb8856e13/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:95cf944fcfc394c5f9de794ce581914900f82ff1f855326f25ebcf24d5397418", size = 489054, upload_time = "2025-04-08T10:35:04.561Z" }, + { url = "https://files.pythonhosted.org/packages/b1/31/33850dfd5c6efb6f27d2465cc4c6b27c5a6f5ed53c6fa63b7263cf5f60f6/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ecf6cd9f83d7c023b1aba15d13f705ca7b7d38675c121f3cc4a6e25bd0857ee9", size = 523955, upload_time = "2025-04-08T10:35:05.786Z" }, + { url = "https://files.pythonhosted.org/packages/09/84/b7d7b67856efb183a421f1416b44ca975cb2ea6c4544827955dfb01f7dc2/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:852de68acd6212cd6d33edf21e6f9e56e5d98c6add46f48244bd479d97c967c6", size = 502234, upload_time = "2025-04-08T10:35:07.187Z" }, + { url = "https://files.pythonhosted.org/packages/71/87/6dc5ec6882a2254cfdd8b0718b684504e737273903b65d7338efaba08b52/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5730f3aa35e646103b53389d5bc77edfbf578ab6dab2e005142b5b80a35ef25", size = 454750, upload_time = "2025-04-08T10:35:08.859Z" }, + { url = "https://files.pythonhosted.org/packages/3d/6c/3786c50213451a0ad15170d091570d4a6554976cf0df19878002fc96075a/watchfiles-1.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:18b3bd29954bc4abeeb4e9d9cf0b30227f0f206c86657674f544cb032296acd5", size = 631591, upload_time = "2025-04-08T10:35:10.64Z" }, + { url = "https://files.pythonhosted.org/packages/1b/b3/1427425ade4e359a0deacce01a47a26024b2ccdb53098f9d64d497f6684c/watchfiles-1.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ba5552a1b07c8edbf197055bc9d518b8f0d98a1c6a73a293bc0726dce068ed01", size = 625370, upload_time = "2025-04-08T10:35:12.412Z" }, + { url = "https://files.pythonhosted.org/packages/15/ba/f60e053b0b5b8145d682672024aa91370a29c5c921a88977eb565de34086/watchfiles-1.0.5-cp311-cp311-win32.whl", hash = "sha256:2f1fefb2e90e89959447bc0420fddd1e76f625784340d64a2f7d5983ef9ad246", size = 277791, upload_time = "2025-04-08T10:35:13.719Z" }, + { url = "https://files.pythonhosted.org/packages/50/ed/7603c4e164225c12c0d4e8700b64bb00e01a6c4eeea372292a3856be33a4/watchfiles-1.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:b6e76ceb1dd18c8e29c73f47d41866972e891fc4cc7ba014f487def72c1cf096", size = 291622, upload_time = "2025-04-08T10:35:15.071Z" }, + { url = "https://files.pythonhosted.org/packages/a2/c2/99bb7c96b4450e36877fde33690ded286ff555b5a5c1d925855d556968a1/watchfiles-1.0.5-cp311-cp311-win_arm64.whl", hash = "sha256:266710eb6fddc1f5e51843c70e3bebfb0f5e77cf4f27129278c70554104d19ed", size = 283699, upload_time = "2025-04-08T10:35:16.732Z" }, + { url = "https://files.pythonhosted.org/packages/2a/8c/4f0b9bdb75a1bfbd9c78fad7d8854369283f74fe7cf03eb16be77054536d/watchfiles-1.0.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b5eb568c2aa6018e26da9e6c86f3ec3fd958cee7f0311b35c2630fa4217d17f2", size = 401511, upload_time = "2025-04-08T10:35:17.956Z" }, + { url = "https://files.pythonhosted.org/packages/dc/4e/7e15825def77f8bd359b6d3f379f0c9dac4eb09dd4ddd58fd7d14127179c/watchfiles-1.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:0a04059f4923ce4e856b4b4e5e783a70f49d9663d22a4c3b3298165996d1377f", size = 392715, upload_time = "2025-04-08T10:35:19.202Z" }, + { url = "https://files.pythonhosted.org/packages/58/65/b72fb817518728e08de5840d5d38571466c1b4a3f724d190cec909ee6f3f/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e380c89983ce6e6fe2dd1e1921b9952fb4e6da882931abd1824c092ed495dec", size = 454138, upload_time = "2025-04-08T10:35:20.586Z" }, + { url = "https://files.pythonhosted.org/packages/3e/a4/86833fd2ea2e50ae28989f5950b5c3f91022d67092bfec08f8300d8b347b/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fe43139b2c0fdc4a14d4f8d5b5d967f7a2777fd3d38ecf5b1ec669b0d7e43c21", size = 458592, upload_time = "2025-04-08T10:35:21.87Z" }, + { url = "https://files.pythonhosted.org/packages/38/7e/42cb8df8be9a37e50dd3a818816501cf7a20d635d76d6bd65aae3dbbff68/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee0822ce1b8a14fe5a066f93edd20aada932acfe348bede8aa2149f1a4489512", size = 487532, upload_time = "2025-04-08T10:35:23.143Z" }, + { url = "https://files.pythonhosted.org/packages/fc/fd/13d26721c85d7f3df6169d8b495fcac8ab0dc8f0945ebea8845de4681dab/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a0dbcb1c2d8f2ab6e0a81c6699b236932bd264d4cef1ac475858d16c403de74d", size = 522865, upload_time = "2025-04-08T10:35:24.702Z" }, + { url = "https://files.pythonhosted.org/packages/a1/0d/7f9ae243c04e96c5455d111e21b09087d0eeaf9a1369e13a01c7d3d82478/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2014a2b18ad3ca53b1f6c23f8cd94a18ce930c1837bd891262c182640eb40a6", size = 499887, upload_time = "2025-04-08T10:35:25.969Z" }, + { url = "https://files.pythonhosted.org/packages/8e/0f/a257766998e26aca4b3acf2ae97dff04b57071e991a510857d3799247c67/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f6ae86d5cb647bf58f9f655fcf577f713915a5d69057a0371bc257e2553234", size = 454498, upload_time = "2025-04-08T10:35:27.353Z" }, + { url = "https://files.pythonhosted.org/packages/81/79/8bf142575a03e0af9c3d5f8bcae911ee6683ae93a625d349d4ecf4c8f7df/watchfiles-1.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1a7bac2bde1d661fb31f4d4e8e539e178774b76db3c2c17c4bb3e960a5de07a2", size = 630663, upload_time = "2025-04-08T10:35:28.685Z" }, + { url = "https://files.pythonhosted.org/packages/f1/80/abe2e79f610e45c63a70d271caea90c49bbf93eb00fa947fa9b803a1d51f/watchfiles-1.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ab626da2fc1ac277bbf752446470b367f84b50295264d2d313e28dc4405d663", size = 625410, upload_time = "2025-04-08T10:35:30.42Z" }, + { url = "https://files.pythonhosted.org/packages/91/6f/bc7fbecb84a41a9069c2c6eb6319f7f7df113adf113e358c57fc1aff7ff5/watchfiles-1.0.5-cp312-cp312-win32.whl", hash = "sha256:9f4571a783914feda92018ef3901dab8caf5b029325b5fe4558c074582815249", size = 277965, upload_time = "2025-04-08T10:35:32.023Z" }, + { url = "https://files.pythonhosted.org/packages/99/a5/bf1c297ea6649ec59e935ab311f63d8af5faa8f0b86993e3282b984263e3/watchfiles-1.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:360a398c3a19672cf93527f7e8d8b60d8275119c5d900f2e184d32483117a705", size = 291693, upload_time = "2025-04-08T10:35:33.225Z" }, + { url = "https://files.pythonhosted.org/packages/7f/7b/fd01087cc21db5c47e5beae507b87965db341cce8a86f9eb12bf5219d4e0/watchfiles-1.0.5-cp312-cp312-win_arm64.whl", hash 
= "sha256:1a2902ede862969077b97523987c38db28abbe09fb19866e711485d9fbf0d417", size = 283287, upload_time = "2025-04-08T10:35:34.568Z" }, + { url = "https://files.pythonhosted.org/packages/c7/62/435766874b704f39b2fecd8395a29042db2b5ec4005bd34523415e9bd2e0/watchfiles-1.0.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0b289572c33a0deae62daa57e44a25b99b783e5f7aed81b314232b3d3c81a11d", size = 401531, upload_time = "2025-04-08T10:35:35.792Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a6/e52a02c05411b9cb02823e6797ef9bbba0bfaf1bb627da1634d44d8af833/watchfiles-1.0.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a056c2f692d65bf1e99c41045e3bdcaea3cb9e6b5a53dcaf60a5f3bd95fc9763", size = 392417, upload_time = "2025-04-08T10:35:37.048Z" }, + { url = "https://files.pythonhosted.org/packages/3f/53/c4af6819770455932144e0109d4854437769672d7ad897e76e8e1673435d/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9dca99744991fc9850d18015c4f0438865414e50069670f5f7eee08340d8b40", size = 453423, upload_time = "2025-04-08T10:35:38.357Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d1/8e88df58bbbf819b8bc5cfbacd3c79e01b40261cad0fc84d1e1ebd778a07/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:894342d61d355446d02cd3988a7326af344143eb33a2fd5d38482a92072d9563", size = 458185, upload_time = "2025-04-08T10:35:39.708Z" }, + { url = "https://files.pythonhosted.org/packages/ff/70/fffaa11962dd5429e47e478a18736d4e42bec42404f5ee3b92ef1b87ad60/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab44e1580924d1ffd7b3938e02716d5ad190441965138b4aa1d1f31ea0877f04", size = 486696, upload_time = "2025-04-08T10:35:41.469Z" }, + { url = "https://files.pythonhosted.org/packages/39/db/723c0328e8b3692d53eb273797d9a08be6ffb1d16f1c0ba2bdbdc2a3852c/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6f9367b132078b2ceb8d066ff6c93a970a18c3029cea37bfd7b2d3dd2e5db8f", size = 522327, upload_time = "2025-04-08T10:35:43.289Z" }, + { url = "https://files.pythonhosted.org/packages/cd/05/9fccc43c50c39a76b68343484b9da7b12d42d0859c37c61aec018c967a32/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2e55a9b162e06e3f862fb61e399fe9f05d908d019d87bf5b496a04ef18a970a", size = 499741, upload_time = "2025-04-08T10:35:44.574Z" }, + { url = "https://files.pythonhosted.org/packages/23/14/499e90c37fa518976782b10a18b18db9f55ea73ca14641615056f8194bb3/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0125f91f70e0732a9f8ee01e49515c35d38ba48db507a50c5bdcad9503af5827", size = 453995, upload_time = "2025-04-08T10:35:46.336Z" }, + { url = "https://files.pythonhosted.org/packages/61/d9/f75d6840059320df5adecd2c687fbc18960a7f97b55c300d20f207d48aef/watchfiles-1.0.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:13bb21f8ba3248386337c9fa51c528868e6c34a707f729ab041c846d52a0c69a", size = 629693, upload_time = "2025-04-08T10:35:48.161Z" }, + { url = "https://files.pythonhosted.org/packages/fc/17/180ca383f5061b61406477218c55d66ec118e6c0c51f02d8142895fcf0a9/watchfiles-1.0.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:839ebd0df4a18c5b3c1b890145b5a3f5f64063c2a0d02b13c76d78fe5de34936", size = 624677, upload_time = "2025-04-08T10:35:49.65Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/15/714d6ef307f803f236d69ee9d421763707899d6298d9f3183e55e366d9af/watchfiles-1.0.5-cp313-cp313-win32.whl", hash = "sha256:4a8ec1e4e16e2d5bafc9ba82f7aaecfeec990ca7cd27e84fb6f191804ed2fcfc", size = 277804, upload_time = "2025-04-08T10:35:51.093Z" }, + { url = "https://files.pythonhosted.org/packages/a8/b4/c57b99518fadf431f3ef47a610839e46e5f8abf9814f969859d1c65c02c7/watchfiles-1.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:f436601594f15bf406518af922a89dcaab416568edb6f65c4e5bbbad1ea45c11", size = 291087, upload_time = "2025-04-08T10:35:52.458Z" }, +] + +[[package]] +name = "websockets" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload_time = "2025-03-05T20:03:41.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/32/18fcd5919c293a398db67443acd33fde142f283853076049824fc58e6f75/websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431", size = 175423, upload_time = "2025-03-05T20:01:56.276Z" }, + { url = "https://files.pythonhosted.org/packages/76/70/ba1ad96b07869275ef42e2ce21f07a5b0148936688c2baf7e4a1f60d5058/websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57", size = 173082, upload_time = "2025-03-05T20:01:57.563Z" }, + { url = "https://files.pythonhosted.org/packages/86/f2/10b55821dd40eb696ce4704a87d57774696f9451108cff0d2824c97e0f97/websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905", size = 173330, upload_time = "2025-03-05T20:01:59.063Z" }, + { url = "https://files.pythonhosted.org/packages/a5/90/1c37ae8b8a113d3daf1065222b6af61cc44102da95388ac0018fcb7d93d9/websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562", size = 182878, upload_time = "2025-03-05T20:02:00.305Z" }, + { url = "https://files.pythonhosted.org/packages/8e/8d/96e8e288b2a41dffafb78e8904ea7367ee4f891dafc2ab8d87e2124cb3d3/websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792", size = 181883, upload_time = "2025-03-05T20:02:03.148Z" }, + { url = "https://files.pythonhosted.org/packages/93/1f/5d6dbf551766308f6f50f8baf8e9860be6182911e8106da7a7f73785f4c4/websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413", size = 182252, upload_time = "2025-03-05T20:02:05.29Z" }, + { url = "https://files.pythonhosted.org/packages/d4/78/2d4fed9123e6620cbf1706c0de8a1632e1a28e7774d94346d7de1bba2ca3/websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8", size = 182521, upload_time = "2025-03-05T20:02:07.458Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3b/66d4c1b444dd1a9823c4a81f50231b921bab54eee2f69e70319b4e21f1ca/websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3", size = 181958, upload_time = "2025-03-05T20:02:09.842Z" }, + { url = "https://files.pythonhosted.org/packages/08/ff/e9eed2ee5fed6f76fdd6032ca5cd38c57ca9661430bb3d5fb2872dc8703c/websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf", size = 181918, upload_time = "2025-03-05T20:02:11.968Z" }, + { url = "https://files.pythonhosted.org/packages/d8/75/994634a49b7e12532be6a42103597b71098fd25900f7437d6055ed39930a/websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85", size = 176388, upload_time = "2025-03-05T20:02:13.32Z" }, + { url = "https://files.pythonhosted.org/packages/98/93/e36c73f78400a65f5e236cd376713c34182e6663f6889cd45a4a04d8f203/websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065", size = 176828, upload_time = "2025-03-05T20:02:14.585Z" }, + { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload_time = "2025-03-05T20:02:16.706Z" }, + { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload_time = "2025-03-05T20:02:18.832Z" }, + { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload_time = "2025-03-05T20:02:20.187Z" }, + { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload_time = "2025-03-05T20:02:22.286Z" }, + { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload_time = "2025-03-05T20:02:24.368Z" }, + { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload_time = "2025-03-05T20:02:25.669Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload_time = "2025-03-05T20:02:26.99Z" }, + { url = 
"https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload_time = "2025-03-05T20:02:30.291Z" }, + { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", size = 182160, upload_time = "2025-03-05T20:02:31.634Z" }, + { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload_time = "2025-03-05T20:02:33.017Z" }, + { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload_time = "2025-03-05T20:02:34.498Z" }, + { url = "https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload_time = "2025-03-05T20:02:36.695Z" }, + { url = "https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload_time = "2025-03-05T20:02:37.985Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload_time = "2025-03-05T20:02:39.298Z" }, + { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload_time = "2025-03-05T20:02:40.595Z" }, + { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload_time = "2025-03-05T20:02:41.926Z" }, + { url = "https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload_time = "2025-03-05T20:02:43.304Z" }, + { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload_time = 
"2025-03-05T20:02:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload_time = "2025-03-05T20:02:50.14Z" }, + { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload_time = "2025-03-05T20:02:51.561Z" }, + { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload_time = "2025-03-05T20:02:53.814Z" }, + { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload_time = "2025-03-05T20:02:55.237Z" }, + { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload_time = "2025-03-05T20:03:39.41Z" }, +] + +[[package]] +name = "werkzeug" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/69/83029f1f6300c5fb2471d621ab06f6ec6b3324685a2ce0f9777fd4a8b71e/werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746", size = 806925, upload_time = "2024-11-08T15:52:18.093Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload_time = "2024-11-08T15:52:16.132Z" }, +] + +[[package]] +name = "wheel" +version = "0.45.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/98/2d9906746cdc6a6ef809ae6338005b3f21bb568bea3165cfc6a243fdc25c/wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729", size = 107545, upload_time = "2024-11-23T00:18:23.513Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload_time = "2024-11-23T00:18:21.207Z" }, +] + +[[package]] +name = "wrapt" +version = "1.14.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/eb/e06e77394d6cf09977d92bff310cb0392930c08a338f99af6066a5a98f92/wrapt-1.14.1.tar.gz", hash = "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d", size = 50890, upload_time = "2022-05-02T05:28:31.026Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e7/f9/8c078b4973604cd968b23eb3dff52028b5c48f2a02c4f1f975f4d5e344d1/wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55", size = 35432, upload_time = "2023-10-07T08:29:58.387Z" }, + { url = "https://files.pythonhosted.org/packages/6e/79/aec8185eefe20e8f49e5adeb0c2e20e016d5916d10872c17705ddac41be2/wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9", size = 36219, upload_time = "2023-10-07T08:30:01.249Z" }, + { url = "https://files.pythonhosted.org/packages/d1/71/8d68004e5d5a676177342a56808af51e1df3b0e54b203e3295a8cd96b53b/wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335", size = 78509, upload_time = "2023-10-07T08:30:03.544Z" }, + { url = "https://files.pythonhosted.org/packages/5a/27/604d6ad71fe5935446df1b7512d491b47fe2aef8c95e9813d03d78024a28/wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9", size = 70972, upload_time = "2023-10-07T08:30:05.619Z" }, + { url = "https://files.pythonhosted.org/packages/7f/1b/e0439eec0db6520968c751bc7e12480bb80bb8d939190e0e55ed762f3c7a/wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8", size = 78402, upload_time = "2023-10-07T08:30:07.408Z" }, + { url = "https://files.pythonhosted.org/packages/b9/45/2cc612ff64061d4416baf8d0daf27bea7f79f0097638ddc2af51a3e647f3/wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf", size = 83373, upload_time = "2023-10-07T08:30:09.317Z" }, + { url = "https://files.pythonhosted.org/packages/ad/b7/332692b8d0387922da0f1323ad36a14e365911def3c78ea0d102f83ac592/wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a", size = 76299, upload_time = "2023-10-07T08:30:10.723Z" }, + { url = "https://files.pythonhosted.org/packages/f2/31/cbce966b6760e62d005c237961e839a755bf0c907199248394e2ee03ab05/wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be", size = 83361, upload_time = "2023-10-07T08:30:11.98Z" }, + { url = "https://files.pythonhosted.org/packages/9a/aa/ab46fb18072b86e87e0965a402f8723217e8c0312d1b3e2a91308df924ab/wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204", size = 33454, upload_time = "2023-10-07T08:30:13.513Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7e/14113996bc6ee68eb987773b4139c87afd3ceff60e27e37648aa5eb2798a/wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224", size = 35616, upload_time = "2023-10-07T08:30:14.868Z" }, +] + +[[package]] +name = "xxhash" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241, upload_time 
= "2024-08-17T09:20:38.972Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969, upload_time = "2024-08-17T09:18:00.852Z" }, + { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800, upload_time = "2024-08-17T09:18:01.863Z" }, + { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566, upload_time = "2024-08-17T09:18:03.461Z" }, + { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214, upload_time = "2024-08-17T09:18:05.616Z" }, + { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433, upload_time = "2024-08-17T09:18:06.957Z" }, + { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822, upload_time = "2024-08-17T09:18:08.331Z" }, + { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538, upload_time = "2024-08-17T09:18:10.332Z" }, + { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953, upload_time = "2024-08-17T09:18:11.707Z" }, + { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594, upload_time = "2024-08-17T09:18:13.799Z" }, + { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971, upload_time = "2024-08-17T09:18:15.824Z" }, + { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050, 
upload_time = "2024-08-17T09:18:17.142Z" }, + { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216, upload_time = "2024-08-17T09:18:18.779Z" }, + { url = "https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120, upload_time = "2024-08-17T09:18:20.009Z" }, + { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003, upload_time = "2024-08-17T09:18:21.052Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777, upload_time = "2024-08-17T09:18:22.809Z" }, + { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969, upload_time = "2024-08-17T09:18:24.025Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787, upload_time = "2024-08-17T09:18:25.318Z" }, + { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959, upload_time = "2024-08-17T09:18:26.518Z" }, + { url = "https://files.pythonhosted.org/packages/fe/86/51258d3e8a8545ff26468c977101964c14d56a8a37f5835bc0082426c672/xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793", size = 200006, upload_time = "2024-08-17T09:18:27.905Z" }, + { url = "https://files.pythonhosted.org/packages/02/0a/96973bd325412feccf23cf3680fd2246aebf4b789122f938d5557c54a6b2/xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be", size = 428326, upload_time = "2024-08-17T09:18:29.335Z" }, + { url = "https://files.pythonhosted.org/packages/11/a7/81dba5010f7e733de88af9555725146fc133be97ce36533867f4c7e75066/xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6", size = 194380, upload_time = "2024-08-17T09:18:30.706Z" }, + { url = "https://files.pythonhosted.org/packages/fb/7d/f29006ab398a173f4501c0e4977ba288f1c621d878ec217b4ff516810c04/xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90", size = 207934, upload_time = 
"2024-08-17T09:18:32.133Z" }, + { url = "https://files.pythonhosted.org/packages/8a/6e/6e88b8f24612510e73d4d70d9b0c7dff62a2e78451b9f0d042a5462c8d03/xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27", size = 216301, upload_time = "2024-08-17T09:18:33.474Z" }, + { url = "https://files.pythonhosted.org/packages/af/51/7862f4fa4b75a25c3b4163c8a873f070532fe5f2d3f9b3fc869c8337a398/xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2", size = 203351, upload_time = "2024-08-17T09:18:34.889Z" }, + { url = "https://files.pythonhosted.org/packages/22/61/8d6a40f288f791cf79ed5bb113159abf0c81d6efb86e734334f698eb4c59/xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d", size = 210294, upload_time = "2024-08-17T09:18:36.355Z" }, + { url = "https://files.pythonhosted.org/packages/17/02/215c4698955762d45a8158117190261b2dbefe9ae7e5b906768c09d8bc74/xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab", size = 414674, upload_time = "2024-08-17T09:18:38.536Z" }, + { url = "https://files.pythonhosted.org/packages/31/5c/b7a8db8a3237cff3d535261325d95de509f6a8ae439a5a7a4ffcff478189/xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e", size = 192022, upload_time = "2024-08-17T09:18:40.138Z" }, + { url = "https://files.pythonhosted.org/packages/78/e3/dd76659b2811b3fd06892a8beb850e1996b63e9235af5a86ea348f053e9e/xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8", size = 30170, upload_time = "2024-08-17T09:18:42.163Z" }, + { url = "https://files.pythonhosted.org/packages/d9/6b/1c443fe6cfeb4ad1dcf231cdec96eb94fb43d6498b4469ed8b51f8b59a37/xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e", size = 30040, upload_time = "2024-08-17T09:18:43.699Z" }, + { url = "https://files.pythonhosted.org/packages/0f/eb/04405305f290173acc0350eba6d2f1a794b57925df0398861a20fbafa415/xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2", size = 26796, upload_time = "2024-08-17T09:18:45.29Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b8/e4b3ad92d249be5c83fa72916c9091b0965cb0faeff05d9a0a3870ae6bff/xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6", size = 31795, upload_time = "2024-08-17T09:18:46.813Z" }, + { url = "https://files.pythonhosted.org/packages/fc/d8/b3627a0aebfbfa4c12a41e22af3742cf08c8ea84f5cc3367b5de2d039cce/xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5", size = 30792, upload_time = "2024-08-17T09:18:47.862Z" }, + { url = "https://files.pythonhosted.org/packages/c3/cc/762312960691da989c7cd0545cb120ba2a4148741c6ba458aa723c00a3f8/xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc", size = 220950, upload_time = "2024-08-17T09:18:49.06Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/e9/cc266f1042c3c13750e86a535496b58beb12bf8c50a915c336136f6168dc/xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3", size = 199980, upload_time = "2024-08-17T09:18:50.445Z" }, + { url = "https://files.pythonhosted.org/packages/bf/85/a836cd0dc5cc20376de26b346858d0ac9656f8f730998ca4324921a010b9/xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c", size = 428324, upload_time = "2024-08-17T09:18:51.988Z" }, + { url = "https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb", size = 194370, upload_time = "2024-08-17T09:18:54.164Z" }, + { url = "https://files.pythonhosted.org/packages/87/a1/b028bb02636dfdc190da01951d0703b3d904301ed0ef6094d948983bef0e/xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f", size = 207911, upload_time = "2024-08-17T09:18:55.509Z" }, + { url = "https://files.pythonhosted.org/packages/80/d5/73c73b03fc0ac73dacf069fdf6036c9abad82de0a47549e9912c955ab449/xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7", size = 216352, upload_time = "2024-08-17T09:18:57.073Z" }, + { url = "https://files.pythonhosted.org/packages/b6/2a/5043dba5ddbe35b4fe6ea0a111280ad9c3d4ba477dd0f2d1fe1129bda9d0/xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326", size = 203410, upload_time = "2024-08-17T09:18:58.54Z" }, + { url = "https://files.pythonhosted.org/packages/a2/b2/9a8ded888b7b190aed75b484eb5c853ddd48aa2896e7b59bbfbce442f0a1/xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf", size = 210322, upload_time = "2024-08-17T09:18:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/98/62/440083fafbc917bf3e4b67c2ade621920dd905517e85631c10aac955c1d2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7", size = 414725, upload_time = "2024-08-17T09:19:01.332Z" }, + { url = "https://files.pythonhosted.org/packages/75/db/009206f7076ad60a517e016bb0058381d96a007ce3f79fa91d3010f49cc2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c", size = 192070, upload_time = "2024-08-17T09:19:03.007Z" }, + { url = "https://files.pythonhosted.org/packages/1f/6d/c61e0668943a034abc3a569cdc5aeae37d686d9da7e39cf2ed621d533e36/xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637", size = 30172, upload_time = "2024-08-17T09:19:04.355Z" }, + { url = "https://files.pythonhosted.org/packages/96/14/8416dce965f35e3d24722cdf79361ae154fa23e2ab730e5323aa98d7919e/xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43", size = 30041, upload_time = "2024-08-17T09:19:05.435Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/ee/518b72faa2073f5aa8e3262408d284892cb79cf2754ba0c3a5870645ef73/xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b", size = 26801, upload_time = "2024-08-17T09:19:06.547Z" }, +] + +[[package]] +name = "yarl" +version = "1.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/62/51/c0edba5219027f6eab262e139f73e2417b0f4efffa23bf562f6e18f76ca5/yarl-1.20.0.tar.gz", hash = "sha256:686d51e51ee5dfe62dec86e4866ee0e9ed66df700d55c828a615640adc885307", size = 185258, upload_time = "2025-04-17T00:45:14.661Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/82/a59d8e21b20ffc836775fa7daedac51d16bb8f3010c4fcb495c4496aa922/yarl-1.20.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fdb5204d17cb32b2de2d1e21c7461cabfacf17f3645e4b9039f210c5d3378bf3", size = 145178, upload_time = "2025-04-17T00:42:04.511Z" }, + { url = "https://files.pythonhosted.org/packages/ba/81/315a3f6f95947cfbf37c92d6fbce42a1a6207b6c38e8c2b452499ec7d449/yarl-1.20.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eaddd7804d8e77d67c28d154ae5fab203163bd0998769569861258e525039d2a", size = 96859, upload_time = "2025-04-17T00:42:06.43Z" }, + { url = "https://files.pythonhosted.org/packages/ad/17/9b64e575583158551b72272a1023cdbd65af54fe13421d856b2850a6ddb7/yarl-1.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:634b7ba6b4a85cf67e9df7c13a7fb2e44fa37b5d34501038d174a63eaac25ee2", size = 94647, upload_time = "2025-04-17T00:42:07.976Z" }, + { url = "https://files.pythonhosted.org/packages/2c/29/8f291e7922a58a21349683f6120a85701aeefaa02e9f7c8a2dc24fe3f431/yarl-1.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d409e321e4addf7d97ee84162538c7258e53792eb7c6defd0c33647d754172e", size = 355788, upload_time = "2025-04-17T00:42:09.902Z" }, + { url = "https://files.pythonhosted.org/packages/26/6d/b4892c80b805c42c228c6d11e03cafabf81662d371b0853e7f0f513837d5/yarl-1.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:ea52f7328a36960ba3231c6677380fa67811b414798a6e071c7085c57b6d20a9", size = 344613, upload_time = "2025-04-17T00:42:11.768Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0e/517aa28d3f848589bae9593717b063a544b86ba0a807d943c70f48fcf3bb/yarl-1.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8703517b924463994c344dcdf99a2d5ce9eca2b6882bb640aa555fb5efc706a", size = 370953, upload_time = "2025-04-17T00:42:13.983Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9b/5bd09d2f1ad6e6f7c2beae9e50db78edd2cca4d194d227b958955573e240/yarl-1.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:077989b09ffd2f48fb2d8f6a86c5fef02f63ffe6b1dd4824c76de7bb01e4f2e2", size = 369204, upload_time = "2025-04-17T00:42:16.386Z" }, + { url = "https://files.pythonhosted.org/packages/9c/85/d793a703cf4bd0d4cd04e4b13cc3d44149470f790230430331a0c1f52df5/yarl-1.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0acfaf1da020253f3533526e8b7dd212838fdc4109959a2c53cafc6db611bff2", size = 358108, upload_time = "2025-04-17T00:42:18.622Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/54/b6c71e13549c1f6048fbc14ce8d930ac5fb8bafe4f1a252e621a24f3f1f9/yarl-1.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4230ac0b97ec5eeb91d96b324d66060a43fd0d2a9b603e3327ed65f084e41f8", size = 346610, upload_time = "2025-04-17T00:42:20.9Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1a/d6087d58bdd0d8a2a37bbcdffac9d9721af6ebe50d85304d9f9b57dfd862/yarl-1.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a6a1e6ae21cdd84011c24c78d7a126425148b24d437b5702328e4ba640a8902", size = 365378, upload_time = "2025-04-17T00:42:22.926Z" }, + { url = "https://files.pythonhosted.org/packages/02/84/e25ddff4cbc001dbc4af76f8d41a3e23818212dd1f0a52044cbc60568872/yarl-1.20.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:86de313371ec04dd2531f30bc41a5a1a96f25a02823558ee0f2af0beaa7ca791", size = 356919, upload_time = "2025-04-17T00:42:25.145Z" }, + { url = "https://files.pythonhosted.org/packages/04/76/898ae362353bf8f64636495d222c8014c8e5267df39b1a9fe1e1572fb7d0/yarl-1.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:dd59c9dd58ae16eaa0f48c3d0cbe6be8ab4dc7247c3ff7db678edecbaf59327f", size = 364248, upload_time = "2025-04-17T00:42:27.475Z" }, + { url = "https://files.pythonhosted.org/packages/1b/b0/9d9198d83a622f1c40fdbf7bd13b224a6979f2e1fc2cf50bfb1d8773c495/yarl-1.20.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a0bc5e05f457b7c1994cc29e83b58f540b76234ba6b9648a4971ddc7f6aa52da", size = 378418, upload_time = "2025-04-17T00:42:29.333Z" }, + { url = "https://files.pythonhosted.org/packages/c7/ce/1f50c1cc594cf5d3f5bf4a9b616fca68680deaec8ad349d928445ac52eb8/yarl-1.20.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c9471ca18e6aeb0e03276b5e9b27b14a54c052d370a9c0c04a68cefbd1455eb4", size = 383850, upload_time = "2025-04-17T00:42:31.668Z" }, + { url = "https://files.pythonhosted.org/packages/89/1e/a59253a87b35bfec1a25bb5801fb69943330b67cfd266278eb07e0609012/yarl-1.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:40ed574b4df723583a26c04b298b283ff171bcc387bc34c2683235e2487a65a5", size = 381218, upload_time = "2025-04-17T00:42:33.523Z" }, + { url = "https://files.pythonhosted.org/packages/85/b0/26f87df2b3044b0ef1a7cf66d321102bdca091db64c5ae853fcb2171c031/yarl-1.20.0-cp311-cp311-win32.whl", hash = "sha256:db243357c6c2bf3cd7e17080034ade668d54ce304d820c2a58514a4e51d0cfd6", size = 86606, upload_time = "2025-04-17T00:42:35.873Z" }, + { url = "https://files.pythonhosted.org/packages/33/46/ca335c2e1f90446a77640a45eeb1cd8f6934f2c6e4df7db0f0f36ef9f025/yarl-1.20.0-cp311-cp311-win_amd64.whl", hash = "sha256:8c12cd754d9dbd14204c328915e23b0c361b88f3cffd124129955e60a4fbfcfb", size = 93374, upload_time = "2025-04-17T00:42:37.586Z" }, + { url = "https://files.pythonhosted.org/packages/c3/e8/3efdcb83073df978bb5b1a9cc0360ce596680e6c3fac01f2a994ccbb8939/yarl-1.20.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e06b9f6cdd772f9b665e5ba8161968e11e403774114420737f7884b5bd7bdf6f", size = 147089, upload_time = "2025-04-17T00:42:39.602Z" }, + { url = "https://files.pythonhosted.org/packages/60/c3/9e776e98ea350f76f94dd80b408eaa54e5092643dbf65fd9babcffb60509/yarl-1.20.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b9ae2fbe54d859b3ade40290f60fe40e7f969d83d482e84d2c31b9bff03e359e", size = 97706, upload_time = "2025-04-17T00:42:41.469Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/5b/45cdfb64a3b855ce074ae607b9fc40bc82e7613b94e7612b030255c93a09/yarl-1.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d12b8945250d80c67688602c891237994d203d42427cb14e36d1a732eda480e", size = 95719, upload_time = "2025-04-17T00:42:43.666Z" }, + { url = "https://files.pythonhosted.org/packages/2d/4e/929633b249611eeed04e2f861a14ed001acca3ef9ec2a984a757b1515889/yarl-1.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:087e9731884621b162a3e06dc0d2d626e1542a617f65ba7cc7aeab279d55ad33", size = 343972, upload_time = "2025-04-17T00:42:45.391Z" }, + { url = "https://files.pythonhosted.org/packages/49/fd/047535d326c913f1a90407a3baf7ff535b10098611eaef2c527e32e81ca1/yarl-1.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:69df35468b66c1a6e6556248e6443ef0ec5f11a7a4428cf1f6281f1879220f58", size = 339639, upload_time = "2025-04-17T00:42:47.552Z" }, + { url = "https://files.pythonhosted.org/packages/48/2f/11566f1176a78f4bafb0937c0072410b1b0d3640b297944a6a7a556e1d0b/yarl-1.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b2992fe29002fd0d4cbaea9428b09af9b8686a9024c840b8a2b8f4ea4abc16f", size = 353745, upload_time = "2025-04-17T00:42:49.406Z" }, + { url = "https://files.pythonhosted.org/packages/26/17/07dfcf034d6ae8837b33988be66045dd52f878dfb1c4e8f80a7343f677be/yarl-1.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4c903e0b42aab48abfbac668b5a9d7b6938e721a6341751331bcd7553de2dcae", size = 354178, upload_time = "2025-04-17T00:42:51.588Z" }, + { url = "https://files.pythonhosted.org/packages/15/45/212604d3142d84b4065d5f8cab6582ed3d78e4cc250568ef2a36fe1cf0a5/yarl-1.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf099e2432131093cc611623e0b0bcc399b8cddd9a91eded8bfb50402ec35018", size = 349219, upload_time = "2025-04-17T00:42:53.674Z" }, + { url = "https://files.pythonhosted.org/packages/e6/e0/a10b30f294111c5f1c682461e9459935c17d467a760c21e1f7db400ff499/yarl-1.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a7f62f5dc70a6c763bec9ebf922be52aa22863d9496a9a30124d65b489ea672", size = 337266, upload_time = "2025-04-17T00:42:55.49Z" }, + { url = "https://files.pythonhosted.org/packages/33/a6/6efa1d85a675d25a46a167f9f3e80104cde317dfdf7f53f112ae6b16a60a/yarl-1.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:54ac15a8b60382b2bcefd9a289ee26dc0920cf59b05368c9b2b72450751c6eb8", size = 360873, upload_time = "2025-04-17T00:42:57.895Z" }, + { url = "https://files.pythonhosted.org/packages/77/67/c8ab718cb98dfa2ae9ba0f97bf3cbb7d45d37f13fe1fbad25ac92940954e/yarl-1.20.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:25b3bc0763a7aca16a0f1b5e8ef0f23829df11fb539a1b70476dcab28bd83da7", size = 360524, upload_time = "2025-04-17T00:43:00.094Z" }, + { url = "https://files.pythonhosted.org/packages/bd/e8/c3f18660cea1bc73d9f8a2b3ef423def8dadbbae6c4afabdb920b73e0ead/yarl-1.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b2586e36dc070fc8fad6270f93242124df68b379c3a251af534030a4a33ef594", size = 365370, upload_time = "2025-04-17T00:43:02.242Z" }, + { url = "https://files.pythonhosted.org/packages/c9/99/33f3b97b065e62ff2d52817155a89cfa030a1a9b43fee7843ef560ad9603/yarl-1.20.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:866349da9d8c5290cfefb7fcc47721e94de3f315433613e01b435473be63daa6", size = 373297, upload_time = 
"2025-04-17T00:43:04.189Z" }, + { url = "https://files.pythonhosted.org/packages/3d/89/7519e79e264a5f08653d2446b26d4724b01198a93a74d2e259291d538ab1/yarl-1.20.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:33bb660b390a0554d41f8ebec5cd4475502d84104b27e9b42f5321c5192bfcd1", size = 378771, upload_time = "2025-04-17T00:43:06.609Z" }, + { url = "https://files.pythonhosted.org/packages/3a/58/6c460bbb884abd2917c3eef6f663a4a873f8dc6f498561fc0ad92231c113/yarl-1.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:737e9f171e5a07031cbee5e9180f6ce21a6c599b9d4b2c24d35df20a52fabf4b", size = 375000, upload_time = "2025-04-17T00:43:09.01Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2a/dd7ed1aa23fea996834278d7ff178f215b24324ee527df53d45e34d21d28/yarl-1.20.0-cp312-cp312-win32.whl", hash = "sha256:839de4c574169b6598d47ad61534e6981979ca2c820ccb77bf70f4311dd2cc64", size = 86355, upload_time = "2025-04-17T00:43:11.311Z" }, + { url = "https://files.pythonhosted.org/packages/ca/c6/333fe0338305c0ac1c16d5aa7cc4841208d3252bbe62172e0051006b5445/yarl-1.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:3d7dbbe44b443b0c4aa0971cb07dcb2c2060e4a9bf8d1301140a33a93c98e18c", size = 92904, upload_time = "2025-04-17T00:43:13.087Z" }, + { url = "https://files.pythonhosted.org/packages/0f/6f/514c9bff2900c22a4f10e06297714dbaf98707143b37ff0bcba65a956221/yarl-1.20.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2137810a20b933b1b1b7e5cf06a64c3ed3b4747b0e5d79c9447c00db0e2f752f", size = 145030, upload_time = "2025-04-17T00:43:15.083Z" }, + { url = "https://files.pythonhosted.org/packages/4e/9d/f88da3fa319b8c9c813389bfb3463e8d777c62654c7168e580a13fadff05/yarl-1.20.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:447c5eadd750db8389804030d15f43d30435ed47af1313303ed82a62388176d3", size = 96894, upload_time = "2025-04-17T00:43:17.372Z" }, + { url = "https://files.pythonhosted.org/packages/cd/57/92e83538580a6968b2451d6c89c5579938a7309d4785748e8ad42ddafdce/yarl-1.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42fbe577272c203528d402eec8bf4b2d14fd49ecfec92272334270b850e9cd7d", size = 94457, upload_time = "2025-04-17T00:43:19.431Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ee/7ee43bd4cf82dddd5da97fcaddb6fa541ab81f3ed564c42f146c83ae17ce/yarl-1.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18e321617de4ab170226cd15006a565d0fa0d908f11f724a2c9142d6b2812ab0", size = 343070, upload_time = "2025-04-17T00:43:21.426Z" }, + { url = "https://files.pythonhosted.org/packages/4a/12/b5eccd1109e2097bcc494ba7dc5de156e41cf8309fab437ebb7c2b296ce3/yarl-1.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4345f58719825bba29895011e8e3b545e6e00257abb984f9f27fe923afca2501", size = 337739, upload_time = "2025-04-17T00:43:23.634Z" }, + { url = "https://files.pythonhosted.org/packages/7d/6b/0eade8e49af9fc2585552f63c76fa59ef469c724cc05b29519b19aa3a6d5/yarl-1.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d9b980d7234614bc4674468ab173ed77d678349c860c3af83b1fffb6a837ddc", size = 351338, upload_time = "2025-04-17T00:43:25.695Z" }, + { url = "https://files.pythonhosted.org/packages/45/cb/aaaa75d30087b5183c7b8a07b4fb16ae0682dd149a1719b3a28f54061754/yarl-1.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af4baa8a445977831cbaa91a9a84cc09debb10bc8391f128da2f7bd070fc351d", size = 353636, upload_time = "2025-04-17T00:43:27.876Z" }, + { url = 
"https://files.pythonhosted.org/packages/98/9d/d9cb39ec68a91ba6e66fa86d97003f58570327d6713833edf7ad6ce9dde5/yarl-1.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123393db7420e71d6ce40d24885a9e65eb1edefc7a5228db2d62bcab3386a5c0", size = 348061, upload_time = "2025-04-17T00:43:29.788Z" }, + { url = "https://files.pythonhosted.org/packages/72/6b/103940aae893d0cc770b4c36ce80e2ed86fcb863d48ea80a752b8bda9303/yarl-1.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab47acc9332f3de1b39e9b702d9c916af7f02656b2a86a474d9db4e53ef8fd7a", size = 334150, upload_time = "2025-04-17T00:43:31.742Z" }, + { url = "https://files.pythonhosted.org/packages/ef/b2/986bd82aa222c3e6b211a69c9081ba46484cffa9fab2a5235e8d18ca7a27/yarl-1.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4a34c52ed158f89876cba9c600b2c964dfc1ca52ba7b3ab6deb722d1d8be6df2", size = 362207, upload_time = "2025-04-17T00:43:34.099Z" }, + { url = "https://files.pythonhosted.org/packages/14/7c/63f5922437b873795d9422cbe7eb2509d4b540c37ae5548a4bb68fd2c546/yarl-1.20.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:04d8cfb12714158abf2618f792c77bc5c3d8c5f37353e79509608be4f18705c9", size = 361277, upload_time = "2025-04-17T00:43:36.202Z" }, + { url = "https://files.pythonhosted.org/packages/81/83/450938cccf732466953406570bdb42c62b5ffb0ac7ac75a1f267773ab5c8/yarl-1.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7dc63ad0d541c38b6ae2255aaa794434293964677d5c1ec5d0116b0e308031f5", size = 364990, upload_time = "2025-04-17T00:43:38.551Z" }, + { url = "https://files.pythonhosted.org/packages/b4/de/af47d3a47e4a833693b9ec8e87debb20f09d9fdc9139b207b09a3e6cbd5a/yarl-1.20.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d02b591a64e4e6ca18c5e3d925f11b559c763b950184a64cf47d74d7e41877", size = 374684, upload_time = "2025-04-17T00:43:40.481Z" }, + { url = "https://files.pythonhosted.org/packages/62/0b/078bcc2d539f1faffdc7d32cb29a2d7caa65f1a6f7e40795d8485db21851/yarl-1.20.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:95fc9876f917cac7f757df80a5dda9de59d423568460fe75d128c813b9af558e", size = 382599, upload_time = "2025-04-17T00:43:42.463Z" }, + { url = "https://files.pythonhosted.org/packages/74/a9/4fdb1a7899f1fb47fd1371e7ba9e94bff73439ce87099d5dd26d285fffe0/yarl-1.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bb769ae5760cd1c6a712135ee7915f9d43f11d9ef769cb3f75a23e398a92d384", size = 378573, upload_time = "2025-04-17T00:43:44.797Z" }, + { url = "https://files.pythonhosted.org/packages/fd/be/29f5156b7a319e4d2e5b51ce622b4dfb3aa8d8204cd2a8a339340fbfad40/yarl-1.20.0-cp313-cp313-win32.whl", hash = "sha256:70e0c580a0292c7414a1cead1e076c9786f685c1fc4757573d2967689b370e62", size = 86051, upload_time = "2025-04-17T00:43:47.076Z" }, + { url = "https://files.pythonhosted.org/packages/52/56/05fa52c32c301da77ec0b5f63d2d9605946fe29defacb2a7ebd473c23b81/yarl-1.20.0-cp313-cp313-win_amd64.whl", hash = "sha256:4c43030e4b0af775a85be1fa0433119b1565673266a70bf87ef68a9d5ba3174c", size = 92742, upload_time = "2025-04-17T00:43:49.193Z" }, + { url = "https://files.pythonhosted.org/packages/d4/2f/422546794196519152fc2e2f475f0e1d4d094a11995c81a465faf5673ffd/yarl-1.20.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b6c4c3d0d6a0ae9b281e492b1465c72de433b782e6b5001c8e7249e085b69051", size = 163575, upload_time = "2025-04-17T00:43:51.533Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/fc/67c64ddab6c0b4a169d03c637fb2d2a212b536e1989dec8e7e2c92211b7f/yarl-1.20.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8681700f4e4df891eafa4f69a439a6e7d480d64e52bf460918f58e443bd3da7d", size = 106121, upload_time = "2025-04-17T00:43:53.506Z" }, + { url = "https://files.pythonhosted.org/packages/6d/00/29366b9eba7b6f6baed7d749f12add209b987c4cfbfa418404dbadc0f97c/yarl-1.20.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:84aeb556cb06c00652dbf87c17838eb6d92cfd317799a8092cee0e570ee11229", size = 103815, upload_time = "2025-04-17T00:43:55.41Z" }, + { url = "https://files.pythonhosted.org/packages/28/f4/a2a4c967c8323c03689383dff73396281ced3b35d0ed140580825c826af7/yarl-1.20.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f166eafa78810ddb383e930d62e623d288fb04ec566d1b4790099ae0f31485f1", size = 408231, upload_time = "2025-04-17T00:43:57.825Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a1/66f7ffc0915877d726b70cc7a896ac30b6ac5d1d2760613603b022173635/yarl-1.20.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5d3d6d14754aefc7a458261027a562f024d4f6b8a798adb472277f675857b1eb", size = 390221, upload_time = "2025-04-17T00:44:00.526Z" }, + { url = "https://files.pythonhosted.org/packages/41/15/cc248f0504610283271615e85bf38bc014224122498c2016d13a3a1b8426/yarl-1.20.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a8f64df8ed5d04c51260dbae3cc82e5649834eebea9eadfd829837b8093eb00", size = 411400, upload_time = "2025-04-17T00:44:02.853Z" }, + { url = "https://files.pythonhosted.org/packages/5c/af/f0823d7e092bfb97d24fce6c7269d67fcd1aefade97d0a8189c4452e4d5e/yarl-1.20.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d9949eaf05b4d30e93e4034a7790634bbb41b8be2d07edd26754f2e38e491de", size = 411714, upload_time = "2025-04-17T00:44:04.904Z" }, + { url = "https://files.pythonhosted.org/packages/83/70/be418329eae64b9f1b20ecdaac75d53aef098797d4c2299d82ae6f8e4663/yarl-1.20.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c366b254082d21cc4f08f522ac201d0d83a8b8447ab562732931d31d80eb2a5", size = 404279, upload_time = "2025-04-17T00:44:07.721Z" }, + { url = "https://files.pythonhosted.org/packages/19/f5/52e02f0075f65b4914eb890eea1ba97e6fd91dd821cc33a623aa707b2f67/yarl-1.20.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91bc450c80a2e9685b10e34e41aef3d44ddf99b3a498717938926d05ca493f6a", size = 384044, upload_time = "2025-04-17T00:44:09.708Z" }, + { url = "https://files.pythonhosted.org/packages/6a/36/b0fa25226b03d3f769c68d46170b3e92b00ab3853d73127273ba22474697/yarl-1.20.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9c2aa4387de4bc3a5fe158080757748d16567119bef215bec643716b4fbf53f9", size = 416236, upload_time = "2025-04-17T00:44:11.734Z" }, + { url = "https://files.pythonhosted.org/packages/cb/3a/54c828dd35f6831dfdd5a79e6c6b4302ae2c5feca24232a83cb75132b205/yarl-1.20.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:d2cbca6760a541189cf87ee54ff891e1d9ea6406079c66341008f7ef6ab61145", size = 402034, upload_time = "2025-04-17T00:44:13.975Z" }, + { url = "https://files.pythonhosted.org/packages/10/97/c7bf5fba488f7e049f9ad69c1b8fdfe3daa2e8916b3d321aa049e361a55a/yarl-1.20.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:798a5074e656f06b9fad1a162be5a32da45237ce19d07884d0b67a0aa9d5fdda", size = 407943, 
upload_time = "2025-04-17T00:44:16.052Z" }, + { url = "https://files.pythonhosted.org/packages/fd/a4/022d2555c1e8fcff08ad7f0f43e4df3aba34f135bff04dd35d5526ce54ab/yarl-1.20.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f106e75c454288472dbe615accef8248c686958c2e7dd3b8d8ee2669770d020f", size = 423058, upload_time = "2025-04-17T00:44:18.547Z" }, + { url = "https://files.pythonhosted.org/packages/4c/f6/0873a05563e5df29ccf35345a6ae0ac9e66588b41fdb7043a65848f03139/yarl-1.20.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:3b60a86551669c23dc5445010534d2c5d8a4e012163218fc9114e857c0586fdd", size = 423792, upload_time = "2025-04-17T00:44:20.639Z" }, + { url = "https://files.pythonhosted.org/packages/9e/35/43fbbd082708fa42e923f314c24f8277a28483d219e049552e5007a9aaca/yarl-1.20.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e429857e341d5e8e15806118e0294f8073ba9c4580637e59ab7b238afca836f", size = 422242, upload_time = "2025-04-17T00:44:22.851Z" }, + { url = "https://files.pythonhosted.org/packages/ed/f7/f0f2500cf0c469beb2050b522c7815c575811627e6d3eb9ec7550ddd0bfe/yarl-1.20.0-cp313-cp313t-win32.whl", hash = "sha256:65a4053580fe88a63e8e4056b427224cd01edfb5f951498bfefca4052f0ce0ac", size = 93816, upload_time = "2025-04-17T00:44:25.491Z" }, + { url = "https://files.pythonhosted.org/packages/3f/93/f73b61353b2a699d489e782c3f5998b59f974ec3156a2050a52dfd7e8946/yarl-1.20.0-cp313-cp313t-win_amd64.whl", hash = "sha256:53b2da3a6ca0a541c1ae799c349788d480e5144cac47dba0266c7cb6c76151fe", size = 101093, upload_time = "2025-04-17T00:44:27.418Z" }, + { url = "https://files.pythonhosted.org/packages/ea/1f/70c57b3d7278e94ed22d85e09685d3f0a38ebdd8c5c73b65ba4c0d0fe002/yarl-1.20.0-py3-none-any.whl", hash = "sha256:5d0fe6af927a47a230f31e6004621fd0959eaa915fc62acfafa67ff7229a3124", size = 46124, upload_time = "2025-04-17T00:45:12.199Z" }, +] From 6a9464a2c5510b956965181458fb5d6ab9e57ca2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 7 May 2025 15:07:34 -0400 Subject: [PATCH 168/182] Convert to full uv/pyproject dependency management --- .github/workflows/test_app.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index 5cff8696..340aa110 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -37,6 +37,9 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Install the project + run: uv sync --locked --all-extras --dev + - name: Run tests run: | pytest tests/test_automated From 2e00141ccd238691eed1ba2e1a28f051484200be Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 7 May 2025 15:09:24 -0400 Subject: [PATCH 169/182] Convert to full uv/pyproject dependency management --- .github/workflows/test_app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index 340aa110..64bf664e 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -42,5 +42,5 @@ jobs: - name: Run tests run: | - pytest tests/test_automated - pytest tests/test_alembic + uv run pytest tests/test_automated + uv run pytest tests/test_alembic From 9d9ca7049d9ee606d1df48f99acba5b7e869c771 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 7 May 2025 15:15:45 -0400 Subject: [PATCH 170/182] Convert to full uv/pyproject dependency management --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 58111591..70a54a83 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 
+9,7 @@ WORKDIR /app COPY pyproject.toml uv.lock ./ # Install dependencies -RUN uv sync +RUN uv sync --no-dev # Must call from the root directory because uv does not add playwright to path RUN /app/.venv/bin/playwright install-deps chromium RUN /app/.venv/bin/playwright install chromium From 1faba4ebbfd6edc5e99151cc8666ebf4ef1505ee Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 7 May 2025 15:23:04 -0400 Subject: [PATCH 171/182] Convert to full uv/pyproject dependency management --- Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 70a54a83..42736a8e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,10 +9,11 @@ WORKDIR /app COPY pyproject.toml uv.lock ./ # Install dependencies -RUN uv sync --no-dev +ENV UV_PROJECT_ENVIRONMENT="/usr/local/" +RUN uv sync --locked --no-dev # Must call from the root directory because uv does not add playwright to path -RUN /app/.venv/bin/playwright install-deps chromium -RUN /app/.venv/bin/playwright install chromium +RUN playwright install-deps chromium +RUN playwright install chromium # Copy project files From fb1acf283ee70773683463a3d73e870daf41bdd6 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 9 May 2025 09:39:29 -0400 Subject: [PATCH 172/182] DRAFT --- ...e17c04_create_url_annotation_flags_view.py | 25 ++- ...007bbcce3_create_url_data_sources_table.py | 54 +---- api/routes/metrics.py | 29 +-- collector_db/AsyncDatabaseClient.py | 68 ++++-- collector_db/DatabaseClient.py | 2 + collector_db/StatementComposer.py | 42 +++- collector_db/models.py | 27 --- .../GetMetricsBatchesAggregatedResponseDTO.py | 5 +- .../GetMetricsBatchesBreakdownResponseDTO.py | 9 +- ...etricsURLsBreakdownSubmittedResponseDTO.py | 4 +- .../task_data_objects/SubmitApprovedURLTDO.py | 5 +- pyproject.toml | 2 + tests/conftest.py | 15 +- tests/helpers/DBDataCreator.py | 121 ++++++++++- tests/helpers/complex_test_data_functions.py | 1 + .../helpers/test_batch_creation_parameters.py | 71 ++++++ .../api/helpers/RequestValidator.py | 31 ++- .../integration/api/test_metrics.py | 202 +++++++++++++++++- .../tasks/test_submit_approved_url_task.py | 18 +- uv.lock | 60 ++++++ 20 files changed, 639 insertions(+), 152 deletions(-) create mode 100644 tests/helpers/test_batch_creation_parameters.py diff --git a/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py b/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py index 0da22094..09f8d825 100644 --- a/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py +++ b/alembic/versions/2025_05_06_0919-f25852e17c04_create_url_annotation_flags_view.py @@ -19,16 +19,17 @@ def upgrade() -> None: op.execute(""" - CREATE VIEW url_annotation_flags AS - SELECT - u.id as url_id, - CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, - CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, - CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, - CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, - CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, - CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, - CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id, + CASE WHEN arts.url_id IS NOT NULL THEN 
TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN cua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed FROM urls u LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id @@ -36,7 +37,9 @@ def upgrade() -> None: LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id - LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id; + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.confirmed_url_agency cua on u.id = cua.url_id + ) """) diff --git a/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py b/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py index da896c1c..499de2e4 100644 --- a/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py +++ b/alembic/versions/2025_05_06_1115-6f2007bbcce3_create_url_data_sources_table.py @@ -46,8 +46,8 @@ def upgrade() -> None: nullable=False, server_default=sa.text('now()') ), - sa.UniqueConstraint('url_id', name='uq_url_id'), - sa.UniqueConstraint('data_source_id', name='uq_data_source_id') + sa.UniqueConstraint('url_id', name='uq_url_data_sources_url_id'), + sa.UniqueConstraint('data_source_id', name='uq_url_data_sources_data_source_id') ) # Migrate existing urls with a data source ID @@ -62,58 +62,8 @@ def upgrade() -> None: # Drop existing data source ID column from urls table op.drop_column('urls', 'data_source_id') - # Add trigger to ensure linked URL has status of submitted - op.execute(""" - CREATE FUNCTION check_url_is_submitted() RETURNS trigger AS $$ - BEGIN - IF EXISTS ( - SELECT 1 FROM urls WHERE id = NEW.url_id AND outcome != 'submitted' - ) THEN - RAISE EXCEPTION 'URL status is not submitted '; - END IF; - RETURN NEW; - END; - $$ LANGUAGE plpgsql; - """) - - op.execute(""" - CREATE TRIGGER check_url_is_submitted - BEFORE INSERT OR UPDATE ON url_data_sources - FOR EACH ROW - EXECUTE FUNCTION check_url_is_submitted(); - """) - - op.execute(""" - CREATE FUNCTION prevent_status_change_if_data_source_exists() RETURNS trigger AS $$ - BEGIN - IF OLD.outcome = 'submitted' AND NEW.outcome IS DISTINCT FROM OLD.status THEN - IF EXISTS ( - SELECT 1 FROM url_data_sources WHERE url_id = OLD.id - ) THEN - RAISE EXCEPTION 'Cannot change status from submitted: related child records exist.'; - END IF; - END IF; - RETURN NEW; - END; - $$ LANGUAGE plpgsql; - """) - - op.execute(""" - CREATE TRIGGER check_status_change - BEFORE UPDATE ON urls - FOR EACH ROW - EXECUTE FUNCTION prevent_status_change_if_data_source_exists(); - """) - def downgrade() -> None: - # Drop new trigger and function on URLS - op.execute(""" - DROP TRIGGER IF EXISTS check_url_is_submitted ON urls; - DROP FUNCTION IF EXISTS check_url_is_submitted; - DROP TRIGGER IF EXISTS 
check_status_change ON urls; - DROP FUNCTION IF EXISTS prevent_status_change_if_data_source_exists; - """) op.drop_table('url_data_sources') diff --git a/api/routes/metrics.py b/api/routes/metrics.py index ab548437..d81aa2e6 100644 --- a/api/routes/metrics.py +++ b/api/routes/metrics.py @@ -1,6 +1,7 @@ from fastapi import APIRouter -from fastapi.params import Query +from fastapi.params import Query, Depends +from api.dependencies import get_async_core from core.AsyncCore import AsyncCore from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO @@ -8,7 +9,7 @@ from core.DTOs.GetMetricsURLsAggregatedResponseDTO import GetMetricsURLsAggregatedResponseDTO from core.DTOs.GetMetricsURLsBreakdownPendingResponseDTO import GetMetricsURLsBreakdownPendingResponseDTO from core.DTOs.GetMetricsURLsBreakdownSubmittedResponseDTO import GetMetricsURLsBreakdownSubmittedResponseDTO -from security_manager.SecurityManager import AccessInfo +from security_manager.SecurityManager import AccessInfo, get_access_info metrics_router = APIRouter( prefix="/metrics", @@ -18,15 +19,15 @@ @metrics_router.get("/batches/aggregated") async def get_batches_aggregated_metrics( - core: AsyncCore, - access_info: AccessInfo + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) ) -> GetMetricsBatchesAggregatedResponseDTO: return await core.get_batches_aggregated_metrics() @metrics_router.get("/batches/breakdown") async def get_batches_breakdown_metrics( - core: AsyncCore, - access_info: AccessInfo, + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), page: int = Query( description="The page number", default=1 @@ -36,28 +37,28 @@ async def get_batches_breakdown_metrics( @metrics_router.get("/urls/aggregate") async def get_urls_aggregated_metrics( - core: AsyncCore, - access_info: AccessInfo + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) ) -> GetMetricsURLsAggregatedResponseDTO: return await core.get_urls_aggregated_metrics() @metrics_router.get("/urls/breakdown/submitted") async def get_urls_breakdown_submitted_metrics( - core: AsyncCore, - access_info: AccessInfo + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: return await core.get_urls_breakdown_submitted_metrics() @metrics_router.get("/urls/breakdown/pending") async def get_urls_breakdown_pending_metrics( - core: AsyncCore, - access_info: AccessInfo + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) ) -> GetMetricsURLsBreakdownPendingResponseDTO: return await core.get_urls_breakdown_pending_metrics() @metrics_router.get("/backlog") async def get_backlog_metrics( - core: AsyncCore, - access_info: AccessInfo + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) ) -> GetMetricsBacklogResponseDTO: return await core.get_backlog_metrics() \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index d30d4aeb..422d1d20 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -7,6 +7,7 @@ from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import 
selectinload, joinedload, QueryableAttribute, aliased +from sqlalchemy.sql.functions import coalesce from starlette import status from collector_db.ConfigManager import ConfigManager @@ -27,7 +28,7 @@ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log, \ - BacklogSnapshot, URLAnnotationFlag, URLDataSource + BacklogSnapshot, URLDataSource from collector_manager.enums import URLStatus, CollectorType from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo @@ -1489,14 +1490,24 @@ async def mark_urls_as_submitted(self, session: AsyncSession, infos: list[Submit for info in infos: url_id = info.url_id data_source_id = info.data_source_id + query = ( update(URL) .where(URL.id == url_id) .values( - data_source_id=data_source_id, outcome=URLStatus.SUBMITTED.value ) ) + + url_data_source_object = URLDataSource( + url_id=url_id, + data_source_id=data_source_id + ) + if info.submitted_at is not None: + url_data_source_object.created_at = info.submitted_at + session.add(url_data_source_object) + + await session.execute(query) @session_manager @@ -1843,9 +1854,11 @@ def url_column(status: URLStatus, label): Batch.strategy, url_column(URLStatus.PENDING, label="pending_count"), url_column(URLStatus.ERROR, label="error_count"), + url_column(URLStatus.VALIDATED, label="validated_count"), url_column(URLStatus.SUBMITTED, label="submitted_count"), url_column(URLStatus.REJECTED, label="rejected_count"), - ).join( + + ).outerjoin( Batch, Batch.id == URL.batch_id ).group_by( Batch.strategy @@ -1854,16 +1867,17 @@ def url_column(status: URLStatus, label): # Combine query = select( Batch.strategy, - batch_count_subquery.c.done_count, - batch_count_subquery.c.error_count, - url_count_subquery.c.pending_count, - url_count_subquery.c.error_count, - url_count_subquery.c.submitted_count, - url_count_subquery.c.rejected_count, + batch_count_subquery.c.done_count.label("batch_done_count"), + batch_count_subquery.c.error_count.label("batch_error_count"), + coalesce(url_count_subquery.c.pending_count, 0).label("pending_count"), + coalesce(url_count_subquery.c.error_count, 0).label("error_count"), + coalesce(url_count_subquery.c.submitted_count, 0).label("submitted_count"), + coalesce(url_count_subquery.c.rejected_count, 0).label("rejected_count"), + coalesce(url_count_subquery.c.validated_count, 0).label("validated_count") ).join( batch_count_subquery, Batch.strategy == batch_count_subquery.c.strategy - ).join( + ).outerjoin( url_count_subquery, Batch.strategy == url_count_subquery.c.strategy ) @@ -1872,10 +1886,13 @@ def url_column(status: URLStatus, label): d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} for result in results: d[CollectorType(result.strategy)] = GetMetricsBatchesAggregatedInnerResponseDTO( - count_successful_batches=result.done_count, - count_failed_batches=result.error_count, - count_urls=result.pending_count + result.submitted_count + result.rejected_count + result.error_count, + count_successful_batches=result.batch_done_count, + count_failed_batches=result.batch_error_count, + count_urls=result.pending_count + result.submitted_count + + result.rejected_count + result.error_count + + result.validated_count, count_urls_pending=result.pending_count, + 
count_urls_validated=result.validated_count, count_urls_submitted=result.submitted_count, count_urls_rejected=result.rejected_count, count_urls_errors=result.error_count @@ -1906,6 +1923,7 @@ async def get_batches_breakdown_metrics( main_query = select( Batch.strategy, Batch.id, + Batch.status, Batch.date_generated.label("created_at"), ) @@ -1925,6 +1943,7 @@ def url_column(status: URLStatus, label): url_column(URLStatus.SUBMITTED, label="count_submitted"), url_column(URLStatus.REJECTED, label="count_rejected"), url_column(URLStatus.ERROR, label="count_error"), + url_column(URLStatus.VALIDATED, label="count_validated"), ).group_by( URL.batch_id ).subquery("url_count") @@ -1933,12 +1952,14 @@ def url_column(status: URLStatus, label): main_query.c.strategy, main_query.c.id, main_query.c.created_at, - count_query.c.count_total, - count_query.c.count_pending, - count_query.c.count_submitted, - count_query.c.count_rejected, - count_query.c.count_error, - ).join( + main_query.c.status, + coalesce(count_query.c.count_total, 0).label("count_total"), + coalesce(count_query.c.count_pending, 0).label("count_pending"), + coalesce(count_query.c.count_submitted, 0).label("count_submitted"), + coalesce(count_query.c.count_rejected, 0).label("count_rejected"), + coalesce(count_query.c.count_error, 0).label("count_error"), + coalesce(count_query.c.count_validated, 0).label("count_validated"), + ).outerjoin( count_query, main_query.c.id == count_query.c.batch_id ).offset( @@ -1952,14 +1973,16 @@ def url_column(status: URLStatus, label): batches: list[GetMetricsBatchesBreakdownInnerResponseDTO] = [] for result in results: dto = GetMetricsBatchesBreakdownInnerResponseDTO( - batch_id=str(result.id), + batch_id=result.id, strategy=CollectorType(result.strategy), + status=BatchStatus(result.status), created_at=result.created_at, count_url_total=result.count_total, count_url_pending=result.count_pending, count_url_submitted=result.count_submitted, count_url_rejected=result.count_rejected, count_url_error=result.count_error, + count_url_validated=result.count_validated ) batches.append(dto) return GetMetricsBatchesBreakdownResponseDTO( @@ -1971,7 +1994,8 @@ async def get_urls_breakdown_submitted_metrics( self, session: AsyncSession ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: - # TODO: Wrong submitted at time: The created at does not indicate when it was submitted + # TODO: Wrong submitted at time: The created at does not indicate + # when it was submitted # Build the query @@ -2061,6 +2085,8 @@ async def get_urls_breakdown_pending_metrics( session: AsyncSession ) -> GetMetricsURLsBreakdownPendingResponseDTO: + + # TODO: Replace with CTE flags = URLAnnotationFlag url = URL diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index 94320fbc..112e5689 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -72,6 +72,8 @@ def insert_batch(self, session, batch_info: BatchInfo) -> int: record_type_match_rate=batch_info.record_type_match_rate, record_category_match_rate=batch_info.record_category_match_rate, ) + if batch_info.date_generated is not None: + batch.date_generated = batch_info.date_generated session.add(batch) session.commit() session.refresh(batch) diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 42c77ef7..648e44f2 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,11 +1,12 @@ from typing import Any -from sqlalchemy import Select, select, exists, Table, func, Subquery, 
and_, not_, ColumnElement +from sqlalchemy import Select, select, exists, Table, func, Subquery, and_, not_, ColumnElement, case, literal from sqlalchemy.orm import aliased from collector_db.enums import URLMetadataAttributeType, ValidationStatus, TaskType from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch, \ - ConfirmedURLAgency, LinkTaskURL, Task, UserUrlAgencySuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion + ConfirmedURLAgency, LinkTaskURL, Task, UserUrlAgencySuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, \ + AutoRecordTypeSuggestion, AutoRelevantSuggestion, ReviewingUserURL from collector_manager.enums import URLStatus, CollectorType from core.enums import BatchStatus @@ -122,4 +123,39 @@ def count_distinct(field, label): @staticmethod def sum_distinct(field, label): - return func.sum(func.distinct(field)).label(label) \ No newline at end of file + return func.sum(func.distinct(field)).label(label) + + @staticmethod + def url_annotation_flags_query() -> Select: + stmt = ( + select( + URL.id.label("url_id"), + case((AutoRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_auto_record_type_suggestion" + ), + case((AutoRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_auto_relevant_suggestion" + ), + case((AutomatedUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_auto_agency_suggestion" + ), + case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_record_type_suggestion" + ), + case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_relevant_suggestion" + ), + case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_agency_suggestion" + ), + case((ReviewingUserURL.url_id != None, literal(True)), else_=literal(False)).label("was_reviewed"), + ) + .outerjoin(AutoRecordTypeSuggestion, URL.id == AutoRecordTypeSuggestion.url_id) + .outerjoin(AutoRelevantSuggestion, URL.id == AutoRelevantSuggestion.url_id) + .outerjoin(AutomatedUrlAgencySuggestion, URL.id == AutomatedUrlAgencySuggestion.url_id) + .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) + .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id) + .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) + .outerjoin(ReviewingUserURL, URL.id == ReviewingUserURL.url_id) + ) + return stmt \ No newline at end of file diff --git a/collector_db/models.py b/collector_db/models.py index 375e5203..b38243dd 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -136,10 +136,6 @@ class URL(Base): confirmed_agencies = relationship( "ConfirmedURLAgency", ) - annotation_flags = relationship( - "URLAnnotationFlag", - back_populates="url" - ) data_source = relationship( "URLDataSource", back_populates="url", @@ -469,29 +465,6 @@ class BacklogSnapshot(Base): count_pending_total = Column(Integer, nullable=False) created_at = get_created_at_column() -class URLAnnotationFlag(Base): - __tablename__ = "url_annotation_flags" - - url_id = Column( - Integer, - ForeignKey("urls.id"), - primary_key=True, - nullable=False - ) - has_auto_record_type_annotation = Column(Boolean, nullable=False) - has_auto_relevant_annotation = Column(Boolean, nullable=False) - has_auto_agency_annotation = Column(Boolean, nullable=False) - 
has_user_record_type_annotation = Column(Boolean, nullable=False) - has_user_relevant_annotation = Column(Boolean, nullable=False) - has_user_agency_annotation = Column(Boolean, nullable=False) - was_reviewed = Column(Boolean, nullable=False) - - # Relationships - url = relationship( - "URL", - back_populates="annotation_flags" - ) - class URLDataSource(Base): __tablename__ = "url_data_sources" diff --git a/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py b/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py index 565ab208..37535f2d 100644 --- a/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py +++ b/core/DTOs/GetMetricsBatchesAggregatedResponseDTO.py @@ -1,3 +1,5 @@ +from typing import Dict + from pydantic import BaseModel from collector_manager.enums import CollectorType @@ -8,6 +10,7 @@ class GetMetricsBatchesAggregatedInnerResponseDTO(BaseModel): count_failed_batches: int count_urls: int count_urls_pending: int + count_urls_validated: int count_urls_submitted: int count_urls_rejected: int count_urls_errors: int @@ -16,7 +19,7 @@ class GetMetricsBatchesAggregatedInnerResponseDTO(BaseModel): class GetMetricsBatchesAggregatedResponseDTO(BaseModel): total_batches: int - by_strategy: dict[ + by_strategy: Dict[ CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO ] \ No newline at end of file diff --git a/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py b/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py index 5797ab54..6572f49f 100644 --- a/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py +++ b/core/DTOs/GetMetricsBatchesBreakdownResponseDTO.py @@ -1,19 +1,22 @@ -import datetime +from datetime import datetime from pydantic import BaseModel from collector_manager.enums import CollectorType +from core.enums import BatchStatus class GetMetricsBatchesBreakdownInnerResponseDTO(BaseModel): - batch_id: str + batch_id: int strategy: CollectorType - created_at: datetime.datetime + status: BatchStatus + created_at: datetime count_url_total: int count_url_pending: int count_url_submitted: int count_url_rejected: int count_url_error: int + count_url_validated: int class GetMetricsBatchesBreakdownResponseDTO(BaseModel): batches: list[GetMetricsBatchesBreakdownInnerResponseDTO] \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py index 7e17effe..687c34c8 100644 --- a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py +++ b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py @@ -1,9 +1,9 @@ -from datetime import datetime +from datetime import date from pydantic import BaseModel class GetMetricsURLsBreakdownSubmittedInnerDTO(BaseModel): - week_of: datetime.date + week_of: date count_submitted: int class GetMetricsURLsBreakdownSubmittedResponseDTO(BaseModel): diff --git a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py index c5b002d0..be26d3a8 100644 --- a/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py +++ b/core/DTOs/task_data_objects/SubmitApprovedURLTDO.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from core.enums import RecordType - +from datetime import datetime class SubmitApprovedURLTDO(BaseModel): url_id: int @@ -22,4 +22,5 @@ class SubmitApprovedURLTDO(BaseModel): class SubmittedURLInfo(BaseModel): url_id: int data_source_id: Optional[int] - request_error: Optional[str] \ No newline at end of file + request_error: Optional[str] + submitted_at: Optional[datetime] = None \ No newline at end of file diff 
--git a/pyproject.toml b/pyproject.toml index de0abfcd..5d2269c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "ckanapi~=4.8", "datasets~=2.19.1", "docker~=7.1.0", + "environs>=14.1.1", "fastapi[standard]~=0.115.6", "from-root~=1.3.0", "google-api-python-client>=2.156.0", @@ -43,6 +44,7 @@ dependencies = [ [dependency-groups] dev = [ "docker>=7.1.0", + "pendulum>=3.1.0", "pytest>=7.2.2", "pytest-asyncio~=0.25.2", "pytest-mock==3.12.0", diff --git a/tests/conftest.py b/tests/conftest.py index d7b1bce7..c8f4bd64 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -62,13 +62,24 @@ def setup_and_teardown(): try: runner.upgrade("head") except Exception as e: + print("Exception while upgrading: ", e) + print("Resetting schema") runner.reset_schema() runner.stamp("base") runner.upgrade("head") - live_connection.close() - engine.dispose() + yield + try: + runner.downgrade("base") + except Exception as e: + print("Exception while downgrading: ", e) + print("Resetting schema") + runner.reset_schema() + runner.stamp("base") + finally: + live_connection.close() + engine.dispose() @pytest.fixture def wipe_database(): diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 695a3c7a..2fad7b0f 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -1,4 +1,5 @@ import asyncio +from datetime import datetime from random import randint from typing import List, Optional @@ -11,15 +12,28 @@ from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType from collector_db.DTOs.URLInfo import URLInfo +from collector_db.DTOs.URLMapping import URLMapping from collector_db.DatabaseClient import DatabaseClient from collector_db.enums import TaskType from collector_manager.enums import CollectorType, URLStatus +from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmittedURLInfo from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO from core.enums import BatchStatus, SuggestionType, RecordType +from helpers.test_batch_creation_parameters import TestBatchCreationParameters, AnnotationInfo from tests.helpers.simple_test_data_functions import generate_test_urls +class URLCreationInfo(BaseModel): + url_mappings: list[URLMapping] + outcome: URLStatus + annotation_info: AnnotationInfo + +class BatchURLCreationInfoV2(BaseModel): + batch_id: int + url_creation_infos: dict[URLStatus, URLCreationInfo] + class BatchURLCreationInfo(BaseModel): batch_id: int url_ids: list[int] @@ -37,9 +51,10 @@ def __init__(self, db_client: Optional[DatabaseClient] = None): self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() def batch( - self, - strategy: CollectorType = CollectorType.EXAMPLE, - batch_status: BatchStatus = BatchStatus.IN_PROCESS + self, + strategy: CollectorType = CollectorType.EXAMPLE, + batch_status: BatchStatus = BatchStatus.IN_PROCESS, + created_at: Optional[datetime] = None ) -> int: return self.db_client.insert_batch( BatchInfo( @@ -47,7 +62,8 @@ def batch( status=batch_status, total_url_count=1, parameters={"test_key": "test_value"}, - user_id=1 + user_id=1, + date_generated=created_at ) ) @@ -57,6 +73,49 @@ async def task(self, url_ids: Optional[list[int]] = None) -> int: await self.adb_client.link_urls_to_task(task_id=task_id, 
url_ids=url_ids) return task_id + async def batch_v2( + self, + parameters: TestBatchCreationParameters + ) -> BatchURLCreationInfoV2: + batch_id = self.batch( + strategy=parameters.strategy, + batch_status=parameters.outcome, + created_at=parameters.created_at + ) + if parameters.outcome in (BatchStatus.ERROR, BatchStatus.ABORTED): + return BatchURLCreationInfoV2( + batch_id=batch_id, + url_creation_infos={} + ) + + d: dict[URLStatus, URLCreationInfo] = {} + for url_parameters in parameters.urls: + iui: InsertURLsInfo = self.urls( + batch_id=batch_id, + url_count=url_parameters.count, + outcome=url_parameters.status, + created_at=parameters.created_at + ) + url_ids = [iui.url_id for iui in iui.url_mappings] + if url_parameters.with_html_content: + await self.html_data(url_ids) + if url_parameters.annotation_info.has_annotations(): + for url_id in url_ids: + await self.annotate( + url_id=url_id, + annotation_info=url_parameters.annotation_info + ) + + d[url_parameters.status] = URLCreationInfo( + url_mappings=iui.url_mappings, + outcome=url_parameters.status, + annotation_info=url_parameters.annotation_info + ) + return BatchURLCreationInfoV2( + batch_id=batch_id, + url_creation_infos=d + ) + async def batch_and_urls( self, strategy: CollectorType = CollectorType.EXAMPLE, @@ -113,6 +172,41 @@ async def auto_relevant_suggestions(self, url_id: int, relevant: bool = True): relevant=relevant ) + async def annotate(self, url_id: int, annotation_info: AnnotationInfo): + info = annotation_info + if info.user_relevant is not None: + await self.user_relevant_suggestion(url_id=url_id, relevant=info.user_relevant) + if info.auto_relevant is not None: + await self.auto_relevant_suggestions(url_id=url_id, relevant=info.auto_relevant) + if info.user_record_type is not None: + await self.user_record_type_suggestion(url_id=url_id, record_type=info.user_record_type) + if info.auto_record_type is not None: + await self.auto_record_type_suggestions(url_id=url_id, record_type=info.auto_record_type) + if info.user_agency is not None: + await self.agency_user_suggestions(url_id=url_id, agency_id=info.user_agency) + if info.auto_agency is not None: + await self.agency_auto_suggestions(url_id=url_id, count=1, suggestion_type=SuggestionType.AUTO_SUGGESTION) + if info.confirmed_agency is not None: + await self.agency_auto_suggestions(url_id=url_id, count=1, suggestion_type=SuggestionType.CONFIRMED) + if info.final_review_approved is not None: + if info.final_review_approved: + final_review_approval_info = FinalReviewApprovalInfo( + url_id=url_id, + record_type=annotation_info.user_record_type, + agency_ids=[annotation_info.user_agency] if annotation_info.user_agency is not None else None, + description="Test Description", + ) + await self.adb_client.approve_url( + approval_info=final_review_approval_info, + user_id=1 + ) + else: + await self.adb_client.reject_url( + url_id=url_id, + user_id=1 + ) + + async def user_relevant_suggestion( self, url_id: int, @@ -204,7 +298,8 @@ def urls( batch_id: int, url_count: int, collector_metadata: Optional[dict] = None, - outcome: URLStatus = URLStatus.PENDING + outcome: URLStatus = URLStatus.PENDING, + created_at: Optional[datetime] = None ) -> InsertURLsInfo: raw_urls = generate_test_urls(url_count) url_infos: List[URLInfo] = [] @@ -214,10 +309,24 @@ def urls( url=url, outcome=outcome, name="Test Name" if outcome == URLStatus.VALIDATED else None, - collector_metadata=collector_metadata + collector_metadata=collector_metadata, + created_at=created_at ) ) + # If outcome is 
submitted, also add entry to DataSourceURL
+        if outcome == URLStatus.SUBMITTED:
+            submitted_url_infos = []
+            for url_info in url_infos:
+                submitted_url_info = SubmittedURLInfo(
+                    url_id=url_info.url_id,
+                    data_source_id=url_info.url_id,  # Use same ID for convenience
+                    request_error=None,
+                    submitted_at=created_at
+                )
+                submitted_url_infos.append(submitted_url_info)
+            asyncio.run(self.adb_client.mark_urls_as_submitted(submitted_url_infos))
+
+
         return self.db_client.insert_urls(
             url_infos=url_infos,
             batch_id=batch_id,
diff --git a/tests/helpers/complex_test_data_functions.py b/tests/helpers/complex_test_data_functions.py
index 6f9ca7c3..bc03020f 100644
--- a/tests/helpers/complex_test_data_functions.py
+++ b/tests/helpers/complex_test_data_functions.py
@@ -121,3 +121,4 @@ async def add_relevant_suggestion(relevant: bool):
         url_mapping=url_mapping,
         user_agency_id=user_agency_id
     )
+
diff --git a/tests/helpers/test_batch_creation_parameters.py b/tests/helpers/test_batch_creation_parameters.py
new file mode 100644
index 00000000..ef8400b9
--- /dev/null
+++ b/tests/helpers/test_batch_creation_parameters.py
@@ -0,0 +1,71 @@
+import datetime
+from typing import Optional
+
+from pydantic import BaseModel, model_validator
+
+from collector_manager.enums import URLStatus, CollectorType
+from core.enums import BatchStatus, AnnotationType, RecordType
+
+
+class AnnotationInfo(BaseModel):
+    user_relevant: Optional[bool] = None
+    auto_relevant: Optional[bool] = None
+    user_record_type: Optional[RecordType] = None
+    auto_record_type: Optional[RecordType] = None
+    user_agency: Optional[int] = None
+    auto_agency: Optional[int] = None
+    confirmed_agency: Optional[int] = None
+    final_review_approved: Optional[bool] = None
+
+    def has_annotations(self):
+        return any([
+            self.user_relevant,
+            self.auto_relevant,
+            self.user_record_type,
+            self.auto_record_type,
+            self.user_agency,
+            self.auto_agency,
+            self.confirmed_agency,
+            self.final_review_approved
+        ])
+
+class TestURLCreationParameters(BaseModel):
+    count: int
+    status: URLStatus = URLStatus.PENDING
+    with_html_content: bool = False
+    annotation_info: AnnotationInfo = AnnotationInfo()
+
+    @model_validator(mode='after')
+    def validate_annotation_info(self):
+        if self.status == URLStatus.REJECTED:
+            self.annotation_info.final_review_approved = False
+            return self
+        if self.status != URLStatus.VALIDATED:
+            return self
+
+        # Assume status is VALIDATED
+        self.annotation_info.final_review_approved = True
+        if self.annotation_info.user_record_type is None:
+            self.annotation_info.user_record_type = RecordType.ARREST_RECORDS
+        if self.annotation_info.user_agency is None:
+            self.annotation_info.user_agency = 1
+
+
+        return self
+
+class TestBatchCreationParameters(BaseModel):
+    created_at: Optional[datetime.datetime] = None
+    outcome: BatchStatus = BatchStatus.READY_TO_LABEL
+    strategy: CollectorType = CollectorType.EXAMPLE
+    urls: Optional[list[TestURLCreationParameters]] = None
+
+    @model_validator(mode='after')
+    def validate_urls(self):
+        if self.outcome != BatchStatus.READY_TO_LABEL:
+            if self.urls is not None:
+                raise ValueError('URLs cannot be provided if outcome is not READY_TO_LABEL')
+            return self
+
+        if self.urls is None:
+            self.urls = [TestURLCreationParameters(count=1)]
+        return self
\ No newline at end of file
diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py
index 91d27729..a61679d5 100644
--- a/tests/test_automated/integration/api/helpers/RequestValidator.py
+++
b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -16,6 +16,9 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse +from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO +from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO +from core.DTOs.GetMetricsBatchesBreakdownResponseDTO import GetMetricsBatchesBreakdownResponseDTO from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -140,6 +143,19 @@ def post_v2( **kwargs ) + def get_v2( + self, + url: str, + params: Optional[dict] = None, + **kwargs + ) -> dict: + return self.open_v2( + method="GET", + url=url, + params=params, + **kwargs + ) + def put( self, @@ -393,4 +409,17 @@ async def search_url(self, url: str) -> SearchURLResponse: url=f"/search/url", params={"url": url} ) - return SearchURLResponse(**data) \ No newline at end of file + return SearchURLResponse(**data) + + async def get_batches_aggregated_metrics(self) -> GetMetricsBatchesAggregatedResponseDTO: + data = self.get_v2( + url="/metrics/batches/aggregated" + ) + return GetMetricsBatchesAggregatedResponseDTO(**data) + + async def get_batches_breakdown_metrics(self, page: int) -> GetMetricsBatchesBreakdownResponseDTO: + data = self.get_v2( + url="/metrics/batches/breakdown", + params={"page": page} + ) + return GetMetricsBatchesBreakdownResponseDTO(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_metrics.py b/tests/test_automated/integration/api/test_metrics.py index 44eff414..d2f4adc6 100644 --- a/tests/test_automated/integration/api/test_metrics.py +++ b/tests/test_automated/integration/api/test_metrics.py @@ -1,22 +1,220 @@ +import pendulum import pytest +from collector_manager.enums import URLStatus, CollectorType +from core.enums import BatchStatus +from helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters + @pytest.mark.asyncio async def test_get_batches_aggregated_metrics(api_test_helper): + ath = api_test_helper # Create successful batches with URLs of different statuses + all_params = [] + for i in range(3): + params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING + ), + TestURLCreationParameters( + count=2, + status=URLStatus.SUBMITTED + ), + TestURLCreationParameters( + count=3, + status=URLStatus.REJECTED + ), + TestURLCreationParameters( + count=4, + status=URLStatus.ERROR + ), + TestURLCreationParameters( + count=5, + status=URLStatus.VALIDATED + ) + ] + ) + all_params.append(params) + # Create failed batches + for i in range(2): + params = TestBatchCreationParameters( + outcome=BatchStatus.ERROR + ) + all_params.append(params) + + for params in all_params: + await ath.db_data_creator.batch_v2(params) + + dto = await ath.request_validator.get_batches_aggregated_metrics() + assert dto.total_batches == 5 + inner_dto_example = dto.by_strategy[CollectorType.EXAMPLE] + assert inner_dto_example.count_urls == 0 + assert inner_dto_example.count_successful_batches == 0 + assert 
inner_dto_example.count_failed_batches == 2 + assert inner_dto_example.count_urls_pending == 0 + assert inner_dto_example.count_urls_submitted == 0 + assert inner_dto_example.count_urls_rejected == 0 + assert inner_dto_example.count_urls_errors == 0 + assert inner_dto_example.count_urls_validated == 0 + + inner_dto_manual = dto.by_strategy[CollectorType.MANUAL] + assert inner_dto_manual.count_urls == 45 + assert inner_dto_manual.count_successful_batches == 3 + assert inner_dto_manual.count_failed_batches == 0 + assert inner_dto_manual.count_urls_pending == 3 + assert inner_dto_manual.count_urls_submitted == 6 + assert inner_dto_manual.count_urls_rejected == 9 + assert inner_dto_manual.count_urls_errors == 12 + assert inner_dto_manual.count_urls_validated == 15 - raise NotImplementedError @pytest.mark.asyncio async def test_get_batches_breakdown_metrics(api_test_helper): - raise NotImplementedError + # Create a different batch for each week, with different URLs + today = pendulum.today() + ath = api_test_helper + + batch_1_params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING + ), + TestURLCreationParameters( + count=2, + status=URLStatus.SUBMITTED + ), + ] + ) + batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + batch_2_params = TestBatchCreationParameters( + strategy=CollectorType.EXAMPLE, + outcome=BatchStatus.ERROR, + created_at=today.subtract(weeks=1), + ) + batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + batch_3_params = TestBatchCreationParameters( + strategy=CollectorType.AUTO_GOOGLER, + created_at=today.subtract(weeks=2), + urls=[ + TestURLCreationParameters( + count=3, + status=URLStatus.REJECTED + ), + TestURLCreationParameters( + count=4, + status=URLStatus.ERROR + ), + TestURLCreationParameters( + count=5, + status=URLStatus.VALIDATED + ), + ] + ) + batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + + dto_1 = await ath.request_validator.get_batches_breakdown_metrics( + page=1 + ) + assert len(dto_1.batches) == 3 + dto_batch_1 = dto_1.batches[2] + assert dto_batch_1.batch_id == batch_1.batch_id + assert dto_batch_1.strategy == CollectorType.MANUAL + assert dto_batch_1.status == BatchStatus.READY_TO_LABEL + assert pendulum.instance(dto_batch_1.created_at) > today + assert dto_batch_1.count_url_total == 3 + assert dto_batch_1.count_url_pending == 1 + assert dto_batch_1.count_url_submitted == 2 + assert dto_batch_1.count_url_rejected == 0 + assert dto_batch_1.count_url_error == 0 + assert dto_batch_1.count_url_validated == 0 + + dto_batch_2 = dto_1.batches[1] + assert dto_batch_2.batch_id == batch_2.batch_id + assert dto_batch_2.status == BatchStatus.ERROR + assert dto_batch_2.strategy == CollectorType.EXAMPLE + assert pendulum.instance(dto_batch_2.created_at) == today.subtract(weeks=1) + assert dto_batch_2.count_url_total == 0 + assert dto_batch_2.count_url_submitted == 0 + assert dto_batch_2.count_url_pending == 0 + assert dto_batch_2.count_url_rejected == 0 + assert dto_batch_2.count_url_error == 0 + assert dto_batch_2.count_url_validated == 0 + + dto_batch_3 = dto_1.batches[0] + assert dto_batch_3.batch_id == batch_3.batch_id + assert dto_batch_3.status == BatchStatus.READY_TO_LABEL + assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER + assert pendulum.instance(dto_batch_3.created_at) == today.subtract(weeks=2) + assert dto_batch_3.count_url_total == 12 + assert dto_batch_3.count_url_pending == 0 + assert dto_batch_3.count_url_submitted == 0 
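+    # Sanity check: batch_3 was created above with 3 rejected, 4 error,
+    # and 5 validated URLs (12 total), so the remaining per-status counts
+    # should mirror those creation parameters exactly.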
+ assert dto_batch_3.count_url_rejected == 3 + assert dto_batch_3.count_url_error == 4 + assert dto_batch_3.count_url_validated == 5 + + dto_2 = await ath.request_validator.get_batches_breakdown_metrics( + page=2 + ) + assert len(dto_2.batches) == 0 @pytest.mark.asyncio async def test_get_urls_breakdown_submitted_metrics(api_test_helper): # Create URLs with submitted status, broken down in different amounts by different weeks # And ensure the URLs are + today = pendulum.today() + ath = api_test_helper + + batch_1_params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING + ), + TestURLCreationParameters( + count=2, + status=URLStatus.SUBMITTED + ), + ] + ) + batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + batch_2_params = TestBatchCreationParameters( + strategy=CollectorType.EXAMPLE, + urls=[ + TestURLCreationParameters( + count=3, + status=URLStatus.REJECTED + ) + ], + created_at=today.subtract(weeks=1), + ) + batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + batch_3_params = TestBatchCreationParameters( + strategy=CollectorType.AUTO_GOOGLER, + created_at=today.subtract(weeks=1), + urls=[ + TestURLCreationParameters( + count=3, + status=URLStatus.REJECTED + ), + TestURLCreationParameters( + count=4, + status=URLStatus.ERROR + ), + TestURLCreationParameters( + count=5, + status=URLStatus.VALIDATED + ), + ] + ) + batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) @pytest.mark.asyncio async def test_get_urls_breakdown_pending_metrics(api_test_helper): diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py index 2d3aa192..32dc765c 100644 --- a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py +++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py @@ -4,7 +4,7 @@ import pytest from collector_db.enums import TaskType -from collector_db.models import URL, URLErrorInfo +from collector_db.models import URL, URLErrorInfo, URLDataSource from collector_manager.enums import URLStatus from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome @@ -152,10 +152,18 @@ async def test_submit_approved_url_task( assert url_2.outcome == URLStatus.SUBMITTED.value assert url_3.outcome == URLStatus.ERROR.value - # Check URLs now have data source ids - assert url_1.data_source_id == 21 - assert url_2.data_source_id == 34 - assert url_3.data_source_id is None + # Get URL Data Source Links + url_data_sources = await db_data_creator.adb_client.get_all(URLDataSource) + assert len(url_data_sources) == 2 + + url_data_source_1 = url_data_sources[0] + url_data_source_2 = url_data_sources[1] + + assert url_data_source_1.url_id == url_1.id + assert url_data_source_1.data_source_id == 21 + + assert url_data_source_2.url_id == url_2.id + assert url_data_source_2.data_source_id == 34 # Check that errored URL has entry in url_error_info url_errors = await db_data_creator.adb_client.get_all(URLErrorInfo) diff --git a/uv.lock b/uv.lock index f2ea60ae..bb269479 100644 --- a/uv.lock +++ b/uv.lock @@ -348,6 +348,7 @@ dependencies = [ { name = "ckanapi" }, { name = "datasets" }, { name = "docker" }, + { name = "environs" }, { name = "fastapi", extra = ["standard"] }, { name = "from-root" }, { name = "google-api-python-client" }, @@ -379,6 +380,7 @@ dependencies = [ [package.dev-dependencies] 
dev = [ { name = "docker" }, + { name = "pendulum" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-mock" }, @@ -396,6 +398,7 @@ requires-dist = [ { name = "ckanapi", specifier = "~=4.8" }, { name = "datasets", specifier = "~=2.19.1" }, { name = "docker", specifier = "~=7.1.0" }, + { name = "environs", specifier = ">=14.1.1" }, { name = "fastapi", extras = ["standard"], specifier = "~=0.115.6" }, { name = "from-root", specifier = "~=1.3.0" }, { name = "google-api-python-client", specifier = ">=2.156.0" }, @@ -427,6 +430,7 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ { name = "docker", specifier = ">=7.1.0" }, + { name = "pendulum", specifier = ">=3.1.0" }, { name = "pytest", specifier = ">=7.2.2" }, { name = "pytest-asyncio", specifier = "~=0.25.2" }, { name = "pytest-mock", specifier = "==3.12.0" }, @@ -519,6 +523,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/ee/bf0adb559ad3c786f12bcbc9296b3f5675f529199bef03e2df281fa1fadb/email_validator-2.2.0-py3-none-any.whl", hash = "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631", size = 33521, upload_time = "2024-06-20T11:30:28.248Z" }, ] +[[package]] +name = "environs" +version = "14.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "marshmallow" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/31/d3/e82bdbb8cc332e751f67a3f668c5d134d57f983497d9f3a59a375b6e8fd8/environs-14.1.1.tar.gz", hash = "sha256:03db7ee2d50ec697b68814cd175a3a05a7c7954804e4e419ca8b570dc5a835cf", size = 32050, upload_time = "2025-02-10T20:24:26.437Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/1c/ab9752f02d32d981d647c05822be9ff93809be8953dacea2da2bec9a9de9/environs-14.1.1-py3-none-any.whl", hash = "sha256:45bc56f1d53bbc59d8dd69bba97377dd88ec28b8229d81cedbd455b21789445b", size = 15566, upload_time = "2025-02-10T20:24:22.116Z" }, +] + [[package]] name = "fastapi" version = "0.115.12" @@ -1441,6 +1458,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload_time = "2024-09-20T13:09:48.112Z" }, ] +[[package]] +name = "pendulum" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/7c/009c12b86c7cc6c403aec80f8a4308598dfc5995e5c523a5491faaa3952e/pendulum-3.1.0.tar.gz", hash = "sha256:66f96303560f41d097bee7d2dc98ffca716fbb3a832c4b3062034c2d45865015", size = 85930, upload_time = "2025-04-19T14:30:01.675Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/6e/d28d3c22e6708b819a94c05bd05a3dfaed5c685379e8b6dc4b34b473b942/pendulum-3.1.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:61a03d14f8c64d13b2f7d5859e4b4053c4a7d3b02339f6c71f3e4606bfd67423", size = 338596, upload_time = "2025-04-19T14:01:11.306Z" }, + { url = "https://files.pythonhosted.org/packages/e1/e6/43324d58021d463c2eeb6146b169d2c935f2f840f9e45ac2d500453d954c/pendulum-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e674ed2d158afa5c361e60f1f67872dc55b492a10cacdaa7fcd7b7da5f158f24", size = 325854, upload_time = "2025-04-19T14:01:13.156Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/a7/d2ae79b960bfdea94dab67e2f118697b08bc9e98eb6bd8d32c4d99240da3/pendulum-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c75377eb16e58bbe7e03ea89eeea49be6fc5de0934a4aef0e263f8b4fa71bc2", size = 344334, upload_time = "2025-04-19T14:01:15.151Z" }, + { url = "https://files.pythonhosted.org/packages/96/94/941f071212e23c29aae7def891fb636930c648386e059ce09ea0dcd43933/pendulum-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:656b8b0ce070f0f2e5e2668247d3c783c55336534aa1f13bd0969535878955e1", size = 382259, upload_time = "2025-04-19T14:01:16.924Z" }, + { url = "https://files.pythonhosted.org/packages/51/ad/a78a701656aec00d16fee636704445c23ca11617a0bfe7c3848d1caa5157/pendulum-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48962903e6c1afe1f13548cb6252666056086c107d59e3d64795c58c9298bc2e", size = 436361, upload_time = "2025-04-19T14:01:18.796Z" }, + { url = "https://files.pythonhosted.org/packages/da/93/83f59ccbf4435c29dca8c63a6560fcbe4783079a468a5f91d9f886fd21f0/pendulum-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d364ec3f8e65010fefd4b0aaf7be5eb97e5df761b107a06f5e743b7c3f52c311", size = 353653, upload_time = "2025-04-19T14:01:20.159Z" }, + { url = "https://files.pythonhosted.org/packages/6f/0f/42d6644ec6339b41066f594e52d286162aecd2e9735aaf994d7e00c9e09d/pendulum-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd52caffc2afb86612ec43bbeb226f204ea12ebff9f3d12f900a7d3097210fcc", size = 524567, upload_time = "2025-04-19T14:01:21.457Z" }, + { url = "https://files.pythonhosted.org/packages/de/45/d84d909202755ab9d3379e5481fdf70f53344ebefbd68d6f5803ddde98a6/pendulum-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d439fccaa35c91f686bd59d30604dab01e8b5c1d0dd66e81648c432fd3f8a539", size = 525571, upload_time = "2025-04-19T14:01:23.329Z" }, + { url = "https://files.pythonhosted.org/packages/0d/e0/4de160773ce3c2f7843c310db19dd919a0cd02cc1c0384866f63b18a6251/pendulum-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:43288773a86d9c5c0ddb645f88f615ff6bd12fd1410b34323662beccb18f3b49", size = 260259, upload_time = "2025-04-19T14:01:24.689Z" }, + { url = "https://files.pythonhosted.org/packages/c1/7f/ffa278f78112c6c6e5130a702042f52aab5c649ae2edf814df07810bbba5/pendulum-3.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:569ea5072ae0f11d625e03b36d865f8037b76e838a3b621f6967314193896a11", size = 253899, upload_time = "2025-04-19T14:01:26.442Z" }, + { url = "https://files.pythonhosted.org/packages/7a/d7/b1bfe15a742f2c2713acb1fdc7dc3594ff46ef9418ac6a96fcb12a6ba60b/pendulum-3.1.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:4dfd53e7583ccae138be86d6c0a0b324c7547df2afcec1876943c4d481cf9608", size = 336209, upload_time = "2025-04-19T14:01:27.815Z" }, + { url = "https://files.pythonhosted.org/packages/eb/87/0392da0c603c828b926d9f7097fbdddaafc01388cb8a00888635d04758c3/pendulum-3.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6a6e06a28f3a7d696546347805536f6f38be458cb79de4f80754430696bea9e6", size = 323130, upload_time = "2025-04-19T14:01:29.336Z" }, + { url = "https://files.pythonhosted.org/packages/c0/61/95f1eec25796be6dddf71440ee16ec1fd0c573fc61a73bd1ef6daacd529a/pendulum-3.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e68d6a51880708084afd8958af42dc8c5e819a70a6c6ae903b1c4bfc61e0f25", size = 341509, upload_time = "2025-04-19T14:01:31.1Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/7b/eb0f5e6aa87d5e1b467a1611009dbdc92f0f72425ebf07669bfadd8885a6/pendulum-3.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e3f1e5da39a7ea7119efda1dd96b529748c1566f8a983412d0908455d606942", size = 378674, upload_time = "2025-04-19T14:01:32.974Z" }, + { url = "https://files.pythonhosted.org/packages/29/68/5a4c1b5de3e54e16cab21d2ec88f9cd3f18599e96cc90a441c0b0ab6b03f/pendulum-3.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9af1e5eeddb4ebbe1b1c9afb9fd8077d73416ade42dd61264b3f3b87742e0bb", size = 436133, upload_time = "2025-04-19T14:01:34.349Z" }, + { url = "https://files.pythonhosted.org/packages/87/5d/f7a1d693e5c0f789185117d5c1d5bee104f5b0d9fbf061d715fb61c840a8/pendulum-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20f74aa8029a42e327bfc150472e0e4d2358fa5d795f70460160ba81b94b6945", size = 351232, upload_time = "2025-04-19T14:01:35.669Z" }, + { url = "https://files.pythonhosted.org/packages/30/77/c97617eb31f1d0554edb073201a294019b9e0a9bd2f73c68e6d8d048cd6b/pendulum-3.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:cf6229e5ee70c2660148523f46c472e677654d0097bec010d6730f08312a4931", size = 521562, upload_time = "2025-04-19T14:01:37.05Z" }, + { url = "https://files.pythonhosted.org/packages/76/22/0d0ef3393303877e757b848ecef8a9a8c7627e17e7590af82d14633b2cd1/pendulum-3.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:350cabb23bf1aec7c7694b915d3030bff53a2ad4aeabc8c8c0d807c8194113d6", size = 523221, upload_time = "2025-04-19T14:01:38.444Z" }, + { url = "https://files.pythonhosted.org/packages/99/f3/aefb579aa3cebd6f2866b205fc7a60d33e9a696e9e629024752107dc3cf5/pendulum-3.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:42959341e843077c41d47420f28c3631de054abd64da83f9b956519b5c7a06a7", size = 260502, upload_time = "2025-04-19T14:01:39.814Z" }, + { url = "https://files.pythonhosted.org/packages/02/74/4332b5d6e34c63d4df8e8eab2249e74c05513b1477757463f7fdca99e9be/pendulum-3.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:006758e2125da2e624493324dfd5d7d1b02b0c44bc39358e18bf0f66d0767f5f", size = 253089, upload_time = "2025-04-19T14:01:41.171Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1f/af928ba4aa403dac9569f787adcf024005e7654433d71f7a84e608716837/pendulum-3.1.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:28658b0baf4b30eb31d096a375983cfed033e60c0a7bbe94fa23f06cd779b50b", size = 336209, upload_time = "2025-04-19T14:01:42.775Z" }, + { url = "https://files.pythonhosted.org/packages/b6/16/b010643007ba964c397da7fa622924423883c1bbff1a53f9d1022cd7f024/pendulum-3.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b114dcb99ce511cb8f5495c7b6f0056b2c3dba444ef1ea6e48030d7371bd531a", size = 323132, upload_time = "2025-04-19T14:01:44.577Z" }, + { url = "https://files.pythonhosted.org/packages/64/19/c3c47aeecb5d9bceb0e89faafd800d39809b696c5b7bba8ec8370ad5052c/pendulum-3.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2404a6a54c80252ea393291f0b7f35525a61abae3d795407f34e118a8f133a18", size = 341509, upload_time = "2025-04-19T14:01:46.084Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/c06921ff6b860ff7e62e70b8e5d4dc70e36f5abb66d168bd64d51760bc4e/pendulum-3.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d06999790d9ee9962a1627e469f98568bf7ad1085553fa3c30ed08b3944a14d7", size = 378674, upload_time = "2025-04-19T14:01:47.727Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/0b/a43953b9eba11e82612b033ac5133f716f1b76b6108a65da6f408b3cc016/pendulum-3.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94751c52f6b7c306734d1044c2c6067a474237e1e5afa2f665d1fbcbbbcf24b3", size = 436133, upload_time = "2025-04-19T14:01:49.126Z" }, + { url = "https://files.pythonhosted.org/packages/eb/a0/ec3d70b3b96e23ae1d039f132af35e17704c22a8250d1887aaefea4d78a6/pendulum-3.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5553ac27be05e997ec26d7f004cf72788f4ce11fe60bb80dda604a64055b29d0", size = 351232, upload_time = "2025-04-19T14:01:50.575Z" }, + { url = "https://files.pythonhosted.org/packages/f4/97/aba23f1716b82f6951ba2b1c9178a2d107d1e66c102762a9bf19988547ea/pendulum-3.1.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f8dee234ca6142bf0514368d01a72945a44685aaa2fc4c14c98d09da9437b620", size = 521563, upload_time = "2025-04-19T14:01:51.9Z" }, + { url = "https://files.pythonhosted.org/packages/01/33/2c0d5216cc53d16db0c4b3d510f141ee0a540937f8675948541190fbd48b/pendulum-3.1.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7378084fe54faab4ee481897a00b710876f2e901ded6221671e827a253e643f2", size = 523221, upload_time = "2025-04-19T14:01:53.275Z" }, + { url = "https://files.pythonhosted.org/packages/51/89/8de955c339c31aeae77fd86d3225509b998c81875e9dba28cb88b8cbf4b3/pendulum-3.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:8539db7ae2c8da430ac2515079e288948c8ebf7eb1edd3e8281b5cdf433040d6", size = 260501, upload_time = "2025-04-19T14:01:54.749Z" }, + { url = "https://files.pythonhosted.org/packages/15/c3/226a3837363e94f8722461848feec18bfdd7d5172564d53aa3c3397ff01e/pendulum-3.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:1ce26a608e1f7387cd393fba2a129507c4900958d4f47b90757ec17656856571", size = 253087, upload_time = "2025-04-19T14:01:55.998Z" }, + { url = "https://files.pythonhosted.org/packages/6e/23/e98758924d1b3aac11a626268eabf7f3cf177e7837c28d47bf84c64532d0/pendulum-3.1.0-py3-none-any.whl", hash = "sha256:f9178c2a8e291758ade1e8dd6371b1d26d08371b4c7730a6e9a3ef8b16ebae0f", size = 111799, upload_time = "2025-04-19T14:02:34.739Z" }, +] + [[package]] name = "playwright" version = "1.49.1" From 1084d647a447083856080cedbd0f4e80e40cbd0d Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 9 May 2025 15:04:34 -0400 Subject: [PATCH 173/182] feat(app): Create metrics endpoints --- ..._add_manual_strategy_to_batch_strategy_.py | 6 + ...5e16e0738f_create_backlogsnapshot_table.py | 2 +- collector_db/AsyncDatabaseClient.py | 67 +++-- collector_db/DatabaseClient.py | 39 ++- collector_db/StatementComposer.py | 27 +- ...tMetricsURLsBreakdownPendingResponseDTO.py | 3 +- ...etricsURLsBreakdownSubmittedResponseDTO.py | 2 +- tests/helpers/DBDataCreator.py | 21 +- .../helpers/test_batch_creation_parameters.py | 2 +- .../api/helpers/RequestValidator.py | 29 +- .../integration/api/test_metrics.py | 255 +++++++++++++++++- 11 files changed, 391 insertions(+), 62 deletions(-) diff --git a/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py b/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py index c5af4d33..9ec86fee 100644 --- a/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py +++ b/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py @@ -38,6 +38,12 @@ def upgrade() -> None: def downgrade() -> None: + # Delete all batches with manual strategy + op.execute(""" + 
DELETE FROM BATCHES + WHERE STRATEGY = 'manual' + """) + switch_enum_type( table_name="batches", column_name="strategy", diff --git a/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py b/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py index d6b118fb..4d2fe7c5 100644 --- a/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py +++ b/alembic/versions/2025_05_06_0816-e55e16e0738f_create_backlogsnapshot_table.py @@ -21,7 +21,7 @@ def upgrade() -> None: op.create_table( 'backlog_snapshot', - sa.Column('id', sa.Integer(), nullable=False), + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), sa.Column('count_pending_total', sa.Integer(), nullable=False), sa.Column('created_at', sa.DateTime(), nullable=False), ) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 422d1d20..e9438c5b 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -3,7 +3,8 @@ from typing import Optional, Type, Any, List from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, update, asc, delete, insert +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, update, asc, delete, insert, CTE +from sqlalchemy.dialects import postgresql from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute, aliased @@ -1994,9 +1995,6 @@ async def get_urls_breakdown_submitted_metrics( self, session: AsyncSession ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: - # TODO: Wrong submitted at time: The created at does not indicate - # when it was submitted - # Build the query week = func.date_trunc('week', URLDataSource.created_at) @@ -2040,7 +2038,7 @@ async def get_urls_aggregated_metrics( ).limit(1) oldest_pending_url = await session.execute(oldest_pending_url_query) - oldest_pending_url = oldest_pending_url.scalars().one_or_none() + oldest_pending_url = oldest_pending_url.one_or_none() if oldest_pending_url is None: oldest_pending_url_id = None oldest_pending_created_at = None @@ -2079,46 +2077,56 @@ def case_column(status: URLStatus, label): oldest_pending_url_created_at=oldest_pending_created_at, ) + def compile(self, statement): + compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) + return compiled_sql + @session_manager async def get_urls_breakdown_pending_metrics( self, session: AsyncSession ) -> GetMetricsURLsBreakdownPendingResponseDTO: + sc = StatementComposer + flags: CTE = sc.url_annotation_flags_query( + status=URLStatus.PENDING + ) - # TODO: Replace with CTE - flags = URLAnnotationFlag - url = URL - week = func.date_trunc('week', url.created_at) + week = func.date_trunc('week', URL.created_at) # Build the query query = ( select( week.label('week'), - func.count(url.id).label('count_total'), - func.count(case((flags.has_user_record_type_annotation == True, 1))).label('user_record_type_count'), - func.count(case((flags.has_user_relevant_annotation == True, 1))).label('user_relevant_count'), - func.count(case((flags.has_user_agency_annotation == True, 1))).label('user_agency_count'), - ) - .where(url.outcome == URLStatus.PENDING.value) - .join(flags.url) + func.count(URL.id).label('count_total'), + func.count(case( + (flags.c.has_user_record_type_annotation == True, 1)) + 
).label('user_record_type_count'), + func.count(case( + (flags.c.has_user_relevant_annotation == True, 1)) + ).label('user_relevant_count'), + func.count(case( + (flags.c.has_user_agency_annotation == True, 1)) + ).label('user_agency_count'), + ) + .join(flags, flags.c.url_id == URL.id) .group_by(week) .order_by(week.asc()) ) # Execute the query and return the results results = await session.execute(query) - all_results = results.scalars().all() + all_results = results.all() final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = [] for result in all_results: dto = GetMetricsURLsBreakdownPendingResponseInnerDTO( week_created_at=result.week, count_pending_total=result.count_total, - count_pending_relevant_user=result.auto_record_type_count, - count_pending_record_type_user=result.auto_relevant_count, - count_pending_agency_user=result.auto_agency_count, + count_pending_relevant_user=result.user_relevant_count, + count_pending_record_type_user=result.user_record_type_count, + count_pending_agency_user=result.user_agency_count, ) final_results.append(dto) return GetMetricsURLsBreakdownPendingResponseDTO( @@ -2175,7 +2183,8 @@ async def get_backlog_metrics( @session_manager async def populate_backlog_snapshot( self, - session: AsyncSession + session: AsyncSession, + dt: Optional[datetime] = None ): sc = StatementComposer # Get count of pending URLs @@ -2183,12 +2192,18 @@ async def populate_backlog_snapshot( sc.count_distinct(URL.id, label="count") ).where( URL.outcome == URLStatus.PENDING.value - ).subquery("pending_count") + ) + + raw_result = await session.execute(query) + count = raw_result.one()[0] # insert count into snapshot - await session.execute( - insert(BacklogSnapshot).values( - count=query.c.count - ) + snapshot = BacklogSnapshot( + count_pending_total=count ) + if dt is not None: + snapshot.created_at = dt + + session.add(snapshot) + diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index 112e5689..8bd8105f 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -1,10 +1,10 @@ from functools import wraps from typing import Optional, List -from sqlalchemy import create_engine +from sqlalchemy import create_engine, update from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import sessionmaker, scoped_session +from sqlalchemy.orm import sessionmaker, scoped_session, Session from collector_db.ConfigManager import ConfigManager from collector_db.DTOs.BatchInfo import BatchInfo @@ -13,10 +13,11 @@ from collector_db.DTOs.LogInfo import LogInfo from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMapping import URLMapping -from collector_db.models import Base, Batch, URL, Log, Duplicate -from collector_manager.enums import CollectorType +from collector_db.models import Base, Batch, URL, Log, Duplicate, URLDataSource +from collector_manager.enums import CollectorType, URLStatus from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO +from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmittedURLInfo from core.EnvVarManager import EnvVarManager from core.enums import BatchStatus @@ -111,6 +112,8 @@ def insert_url(self, session, url_info: URLInfo) -> int: outcome=url_info.outcome.value, name=url_info.name ) + if url_info.created_at is not None: + url_entry.created_at = url_info.created_at session.add(url_entry) session.commit() session.refresh(url_entry) @@ 
-166,6 +169,34 @@ def update_url(self, session, url_info: URLInfo): url = session.query(URL).filter_by(id=url_info.id).first() url.collector_metadata = url_info.collector_metadata + @session_manager + def mark_urls_as_submitted( + self, + session: Session, + infos: list[SubmittedURLInfo] + ): + for info in infos: + url_id = info.url_id + data_source_id = info.data_source_id + + query = ( + update(URL) + .where(URL.id == url_id) + .values( + outcome=URLStatus.SUBMITTED.value + ) + ) + + url_data_source_object = URLDataSource( + url_id=url_id, + data_source_id=data_source_id + ) + if info.submitted_at is not None: + url_data_source_object.created_at = info.submitted_at + session.add(url_data_source_object) + + session.execute(query) + if __name__ == "__main__": client = DatabaseClient() print("Database client initialized.") diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 648e44f2..23c817b1 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,6 +1,6 @@ -from typing import Any +from typing import Any, Optional -from sqlalchemy import Select, select, exists, Table, func, Subquery, and_, not_, ColumnElement, case, literal +from sqlalchemy import Select, select, exists, Table, func, Subquery, and_, not_, ColumnElement, case, literal, CTE from sqlalchemy.orm import aliased from collector_db.enums import URLMetadataAttributeType, ValidationStatus, TaskType @@ -126,27 +126,29 @@ def sum_distinct(field, label): return func.sum(func.distinct(field)).label(label) @staticmethod - def url_annotation_flags_query() -> Select: + def url_annotation_flags_query( + status: Optional[URLStatus] = None + ) -> CTE: stmt = ( select( URL.id.label("url_id"), case((AutoRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_auto_record_type_suggestion" + "has_auto_record_type_annotation" ), case((AutoRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_auto_relevant_suggestion" + "has_auto_relevant_annotation" ), case((AutomatedUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_auto_agency_suggestion" + "has_auto_agency_annotation" ), case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_record_type_suggestion" + "has_user_record_type_annotation" ), case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_relevant_suggestion" + "has_user_relevant_annotation" ), case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_agency_suggestion" + "has_user_agency_annotation" ), case((ReviewingUserURL.url_id != None, literal(True)), else_=literal(False)).label("was_reviewed"), ) @@ -158,4 +160,9 @@ def url_annotation_flags_query() -> Select: .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) .outerjoin(ReviewingUserURL, URL.id == ReviewingUserURL.url_id) ) - return stmt \ No newline at end of file + if status is not None: + stmt = stmt.where( + URL.outcome == status.value + ) + + return stmt.cte("url_annotation_flags") \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py index 304555b0..22235e45 100644 --- a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py +++ b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py @@ -1,7 +1,8 @@ from pydantic import BaseModel +from datetime 
import datetime class GetMetricsURLsBreakdownPendingResponseInnerDTO(BaseModel): - week_created_at: str + week_created_at: datetime count_pending_total: int count_pending_relevant_user: int count_pending_record_type_user: int diff --git a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py index 687c34c8..d5c1dde5 100644 --- a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py +++ b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py @@ -7,4 +7,4 @@ class GetMetricsURLsBreakdownSubmittedInnerDTO(BaseModel): count_submitted: int class GetMetricsURLsBreakdownSubmittedResponseDTO(BaseModel): - entries: list \ No newline at end of file + entries: list[GetMetricsURLsBreakdownSubmittedInnerDTO] \ No newline at end of file diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 2fad7b0f..71338d84 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -21,7 +21,7 @@ from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmittedURLInfo from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO from core.enums import BatchStatus, SuggestionType, RecordType -from helpers.test_batch_creation_parameters import TestBatchCreationParameters, AnnotationInfo +from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, AnnotationInfo from tests.helpers.simple_test_data_functions import generate_test_urls @@ -314,23 +314,26 @@ def urls( ) ) + url_insert_info = self.db_client.insert_urls( + url_infos=url_infos, + batch_id=batch_id, + ) + # If outcome is submitted, also add entry to DataSourceURL if outcome == URLStatus.SUBMITTED: submitted_url_infos = [] - for url_info in url_infos: + for url_id in url_insert_info.url_ids: submitted_url_info = SubmittedURLInfo( - url_id=url_info.url_id, - data_source_id=url_info.url_id, # Use same ID for convenience, + url_id=url_id, + data_source_id=url_id, # Use same ID for convenience, request_error=None, submitted_at=created_at ) - asyncio.run(self.adb_client.mark_urls_as_submitted(submitted_url_infos)) + submitted_url_infos.append(submitted_url_info) + self.db_client.mark_urls_as_submitted(submitted_url_infos) - return self.db_client.insert_urls( - url_infos=url_infos, - batch_id=batch_id, - ) + return url_insert_info async def url_miscellaneous_metadata( self, diff --git a/tests/helpers/test_batch_creation_parameters.py b/tests/helpers/test_batch_creation_parameters.py index ef8400b9..cfb4805e 100644 --- a/tests/helpers/test_batch_creation_parameters.py +++ b/tests/helpers/test_batch_creation_parameters.py @@ -18,7 +18,7 @@ class AnnotationInfo(BaseModel): final_review_approved: Optional[bool] = None def has_annotations(self): - return any([ + return any(value is not None for value in [ self.user_relevant, self.auto_relevant, self.user_record_type, diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index a61679d5..9207305a 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -19,6 +19,9 @@ from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO from core.DTOs.GetMetricsBatchesBreakdownResponseDTO import GetMetricsBatchesBreakdownResponseDTO +from 
core.DTOs.GetMetricsURLsAggregatedResponseDTO import GetMetricsURLsAggregatedResponseDTO +from core.DTOs.GetMetricsURLsBreakdownPendingResponseDTO import GetMetricsURLsBreakdownPendingResponseDTO +from core.DTOs.GetMetricsURLsBreakdownSubmittedResponseDTO import GetMetricsURLsBreakdownSubmittedResponseDTO from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseOuterInfo from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseOuterInfo from core.DTOs.GetNextURLForAgencyAnnotationResponse import GetNextURLForAgencyAnnotationResponse, \ @@ -422,4 +425,28 @@ async def get_batches_breakdown_metrics(self, page: int) -> GetMetricsBatchesBre url="/metrics/batches/breakdown", params={"page": page} ) - return GetMetricsBatchesBreakdownResponseDTO(**data) \ No newline at end of file + return GetMetricsBatchesBreakdownResponseDTO(**data) + + async def get_urls_breakdown_submitted_metrics(self) -> GetMetricsURLsBreakdownSubmittedResponseDTO: + data = self.get_v2( + url="/metrics/urls/breakdown/submitted" + ) + return GetMetricsURLsBreakdownSubmittedResponseDTO(**data) + + async def get_urls_breakdown_pending_metrics(self) -> GetMetricsURLsBreakdownPendingResponseDTO: + data = self.get_v2( + url="/metrics/urls/breakdown/pending" + ) + return GetMetricsURLsBreakdownPendingResponseDTO(**data) + + async def get_backlog_metrics(self) -> GetMetricsBacklogResponseDTO: + data = self.get_v2( + url="/metrics/backlog" + ) + return GetMetricsBacklogResponseDTO(**data) + + async def get_urls_aggregated_metrics(self) -> GetMetricsURLsAggregatedResponseDTO: + data = self.get_v2( + url="/metrics/urls/aggregate", + ) + return GetMetricsURLsAggregatedResponseDTO(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_metrics.py b/tests/test_automated/integration/api/test_metrics.py index d2f4adc6..fc45ad0b 100644 --- a/tests/test_automated/integration/api/test_metrics.py +++ b/tests/test_automated/integration/api/test_metrics.py @@ -2,8 +2,9 @@ import pytest from collector_manager.enums import URLStatus, CollectorType -from core.enums import BatchStatus -from helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters +from core.enums import BatchStatus, RecordType +from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters, \ + AnnotationInfo @pytest.mark.asyncio @@ -190,7 +191,7 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.REJECTED + status=URLStatus.SUBMITTED ) ], created_at=today.subtract(weeks=1), @@ -202,7 +203,7 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.REJECTED + status=URLStatus.SUBMITTED ), TestURLCreationParameters( count=4, @@ -216,24 +217,262 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): ) batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + dto = await ath.request_validator.get_urls_breakdown_submitted_metrics() + assert len(dto.entries) == 2 + + entry_1 = dto.entries[0] + assert entry_1.count_submitted == 6 + + entry_2 = dto.entries[1] + assert entry_2.count_submitted == 2 + + @pytest.mark.asyncio async def test_get_urls_breakdown_pending_metrics(api_test_helper): # Build URLs, broken down into three separate weeks, # with each week having a different number of pending URLs # with a 
different number of kinds of annotations per URLs + + today = pendulum.today() + ath = api_test_helper + + agency_id = await ath.db_data_creator.agency() # Additionally, add some URLs that are submitted, # validated, errored, and ensure they are not counted + batch_1_params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING, + annotation_info=AnnotationInfo( + user_relevant=False + ) + ), + TestURLCreationParameters( + count=2, + status=URLStatus.SUBMITTED + ), + ] + ) + batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + batch_2_params = TestBatchCreationParameters( + strategy=CollectorType.EXAMPLE, + urls=[ + TestURLCreationParameters( + count=3, + status=URLStatus.PENDING, + annotation_info=AnnotationInfo( + user_relevant=True, + user_record_type=RecordType.CALLS_FOR_SERVICE + ) + ) + ], + created_at=today.subtract(weeks=1), + ) + batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + batch_3_params = TestBatchCreationParameters( + strategy=CollectorType.AUTO_GOOGLER, + created_at=today.subtract(weeks=1), + urls=[ + TestURLCreationParameters( + count=3, + status=URLStatus.SUBMITTED + ), + TestURLCreationParameters( + count=4, + status=URLStatus.ERROR + ), + TestURLCreationParameters( + count=5, + status=URLStatus.PENDING, + annotation_info=AnnotationInfo( + user_relevant=True, + user_record_type=RecordType.INCARCERATION_RECORDS, + user_agency=agency_id + ) + ), + ] + ) + batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + + dto = await ath.request_validator.get_urls_breakdown_pending_metrics() + assert len(dto.entries) == 2 + + entry_1 = dto.entries[0] + assert entry_1.count_pending_total == 8 + assert entry_1.count_pending_relevant_user == 8 + assert entry_1.count_pending_record_type_user == 8 + assert entry_1.count_pending_agency_user == 5 + + entry_2 = dto.entries[1] + assert entry_2.count_pending_total == 1 + assert entry_2.count_pending_relevant_user == 1 + assert entry_2.count_pending_record_type_user == 0 + assert entry_2.count_pending_agency_user == 0 + +@pytest.mark.asyncio +async def test_get_urls_aggregate_metrics(api_test_helper): + ath = api_test_helper + today = pendulum.today() + + batch_0_params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + created_at=today.subtract(days=1), + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING, + ), + ] + ) + batch_0 = await ath.db_data_creator.batch_v2(batch_0_params) + oldest_url_id = batch_0.url_creation_infos[URLStatus.PENDING].url_mappings[0].url_id + + + batch_1_params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING, + ), + TestURLCreationParameters( + count=2, + status=URLStatus.SUBMITTED + ), + ] + ) + batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + + batch_2_params = TestBatchCreationParameters( + strategy=CollectorType.AUTO_GOOGLER, + urls=[ + TestURLCreationParameters( + count=4, + status=URLStatus.PENDING, + ), + TestURLCreationParameters( + count=2, + status=URLStatus.ERROR + ), + TestURLCreationParameters( + count=1, + status=URLStatus.VALIDATED + ), + TestURLCreationParameters( + count=5, + status=URLStatus.REJECTED + ), + ] + ) + batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + + dto = await ath.request_validator.get_urls_aggregated_metrics() + + assert dto.oldest_pending_url_id == oldest_url_id + assert dto.oldest_pending_url_created_at == 
today.subtract(days=1).in_timezone('UTC').naive() + assert dto.count_urls_pending == 6 + assert dto.count_urls_rejected == 5 + assert dto.count_urls_errors == 2 + assert dto.count_urls_validated == 1 + assert dto.count_urls_submitted == 2 + assert dto.count_urls_total == 16 - raise NotImplementedError @pytest.mark.asyncio async def test_get_backlog_metrics(api_test_helper): - # Populate the backlog table and test that backlog metrics returned on a weekly basis + today = pendulum.today() + + ath = api_test_helper + adb_client = ath.adb_client() + + # Populate the backlog table and test that backlog metrics returned on a weekly basis # Ensure that multiple days in each week are added to the backlog table, with different values - # Test that the count closest to the beginning of the week is returned for each week - raise NotImplementedError + batch_1_params = TestBatchCreationParameters( + strategy=CollectorType.MANUAL, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING, + annotation_info=AnnotationInfo( + user_relevant=False + ) + ), + TestURLCreationParameters( + count=2, + status=URLStatus.SUBMITTED + ), + ] + ) + batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + + await adb_client.populate_backlog_snapshot( + dt=today.subtract(weeks=3).naive() + ) + + await adb_client.populate_backlog_snapshot( + dt=today.subtract(weeks=2, days=3).naive() + ) + + batch_2_params = TestBatchCreationParameters( + strategy=CollectorType.AUTO_GOOGLER, + urls=[ + TestURLCreationParameters( + count=4, + status=URLStatus.PENDING, + annotation_info=AnnotationInfo( + user_relevant=False + ) + ), + TestURLCreationParameters( + count=2, + status=URLStatus.ERROR + ), + ] + ) + batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + + await adb_client.populate_backlog_snapshot( + dt=today.subtract(weeks=2).naive() + ) + + await adb_client.populate_backlog_snapshot( + dt=today.subtract(weeks=1, days=4).naive() + ) + + batch_3_params = TestBatchCreationParameters( + strategy=CollectorType.AUTO_GOOGLER, + urls=[ + TestURLCreationParameters( + count=7, + status=URLStatus.PENDING, + annotation_info=AnnotationInfo( + user_relevant=False + ) + ), + TestURLCreationParameters( + count=5, + status=URLStatus.VALIDATED + ), + ] + ) + batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + + await adb_client.populate_backlog_snapshot( + dt=today.subtract(weeks=1).naive() + ) + + dto = await ath.request_validator.get_backlog_metrics() + + assert len(dto.entries) == 3 + + # Test that the count closest to the beginning of the week is returned for each week + assert dto.entries[0].count_pending_total == 1 + assert dto.entries[1].count_pending_total == 5 + assert dto.entries[2].count_pending_total == 12 \ No newline at end of file From 7de9c50a90532bf3db3c3a44bca32db29a3f451c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 11 May 2025 10:54:53 -0400 Subject: [PATCH 174/182] DRAFT --- ...ebe_set_default_created_at_for_backlog_.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py diff --git a/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py b/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py new file mode 100644 index 00000000..76431bb3 --- /dev/null +++ b/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py @@ -0,0 +1,38 @@ +"""Set default created_at for backlog_snapshot + 
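+Adds a server-side now() default to backlog_snapshot.created_at so that
+rows inserted without an explicit timestamp are stamped by the database.
+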
+Revision ID: 9d4002437ebe +Revises: 6f2007bbcce3 +Create Date: 2025-05-11 10:54:22.797147 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '9d4002437ebe' +down_revision: Union[str, None] = '6f2007bbcce3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + table_name='backlog_snapshots', + column_name='created_at', + existing_type=sa.DateTime(), + nullable=False, + server_default=sa.text('now()') + ) + + +def downgrade() -> None: + op.alter_column( + table_name='backlog_snapshots', + column_name='created_at', + existing_type=sa.DateTime(), + nullable=False, + server_default=None + ) From c61d9149e8d03417f340ebdb5001a94bdfeade96 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 11 May 2025 16:23:43 -0400 Subject: [PATCH 175/182] DRAFT --- ..._11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py b/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py index 76431bb3..f45fee4b 100644 --- a/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py +++ b/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py @@ -20,7 +20,7 @@ def upgrade() -> None: op.alter_column( - table_name='backlog_snapshots', + table_name='backlog_snapshot', column_name='created_at', existing_type=sa.DateTime(), nullable=False, From 1281e408f3f94325227cffd6a47cde08bd322a84 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 12 May 2025 16:48:50 -0400 Subject: [PATCH 176/182] Fix bug in `get_urls_breakdown_pending_metrics` --- collector_db/AsyncDatabaseClient.py | 25 +++++++++++++---- collector_db/StatementComposer.py | 42 ----------------------------- 2 files changed, 20 insertions(+), 47 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index e9438c5b..de0bd36a 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -3,7 +3,7 @@ from typing import Optional, Type, Any, List from fastapi import HTTPException -from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, update, asc, delete, insert, CTE +from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, update, asc, delete, insert, CTE, literal from sqlalchemy.dialects import postgresql from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker @@ -2088,9 +2088,23 @@ async def get_urls_breakdown_pending_metrics( ) -> GetMetricsURLsBreakdownPendingResponseDTO: sc = StatementComposer - flags: CTE = sc.url_annotation_flags_query( - status=URLStatus.PENDING - ) + flags = ( + select( + URL.id.label("url_id"), + case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_record_type_annotation" + ), + case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_relevant_annotation" + ), + case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_agency_annotation" + ), + ) + .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) + .outerjoin(UserRelevantSuggestion, URL.id == 
UserRelevantSuggestion.url_id) + .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) + ).cte("flags") week = func.date_trunc('week', URL.created_at) @@ -2110,7 +2124,8 @@ async def get_urls_breakdown_pending_metrics( (flags.c.has_user_agency_annotation == True, 1)) ).label('user_agency_count'), ) - .join(flags, flags.c.url_id == URL.id) + .outerjoin(flags, flags.c.url_id == URL.id) + .where(URL.outcome == URLStatus.PENDING.value) .group_by(week) .order_by(week.asc()) ) diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index 23c817b1..2ea33c5f 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -124,45 +124,3 @@ def count_distinct(field, label): @staticmethod def sum_distinct(field, label): return func.sum(func.distinct(field)).label(label) - - @staticmethod - def url_annotation_flags_query( - status: Optional[URLStatus] = None - ) -> CTE: - stmt = ( - select( - URL.id.label("url_id"), - case((AutoRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_auto_record_type_annotation" - ), - case((AutoRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_auto_relevant_annotation" - ), - case((AutomatedUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_auto_agency_annotation" - ), - case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_record_type_annotation" - ), - case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_relevant_annotation" - ), - case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_agency_annotation" - ), - case((ReviewingUserURL.url_id != None, literal(True)), else_=literal(False)).label("was_reviewed"), - ) - .outerjoin(AutoRecordTypeSuggestion, URL.id == AutoRecordTypeSuggestion.url_id) - .outerjoin(AutoRelevantSuggestion, URL.id == AutoRelevantSuggestion.url_id) - .outerjoin(AutomatedUrlAgencySuggestion, URL.id == AutomatedUrlAgencySuggestion.url_id) - .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) - .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id) - .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) - .outerjoin(ReviewingUserURL, URL.id == ReviewingUserURL.url_id) - ) - if status is not None: - stmt = stmt.where( - URL.outcome == status.value - ) - - return stmt.cte("url_annotation_flags") \ No newline at end of file From 04e455160265ead326231c5ea7210a4c169bbd5d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 12 May 2025 18:33:18 -0400 Subject: [PATCH 177/182] fix(app): Address bug in agency identification Previously, agency identification was erroneously pulling up URLs that had an error status. This has been addressed. 
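
The shape of the fix, sketched as standalone code for reference (imports are
the project's own; this is illustrative, not the full method body from
collector_db/AsyncDatabaseClient.py):

    from sqlalchemy import select
    from collector_db.models import URL
    from collector_manager.enums import URLStatus

    # Status guard added to the agency-identification candidate queries:
    # only pending URLs are selected, so errored (and otherwise
    # non-pending) URLs are never surfaced for identification.
    statement = (
        select(URL.id)
        .where(URL.outcome == URLStatus.PENDING.value)
    )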
--- ...ebe_set_default_created_at_for_backlog_.py | 2 +- collector_db/AsyncDatabaseClient.py | 3 +++ core/TaskManager.py | 9 +++---- hugging_face/HuggingFaceInterface.py | 8 +++++- hugging_face/relevancy_worker.py | 8 ++++++ local_database/classes/DockerContainer.py | 6 +++++ tests/helpers/DBDataCreator.py | 4 +-- .../tasks/test_agency_preannotation_task.py | 26 ++++++++++++++++--- 8 files changed, 53 insertions(+), 13 deletions(-) diff --git a/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py b/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py index f45fee4b..fbdb5645 100644 --- a/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py +++ b/alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py @@ -30,7 +30,7 @@ def upgrade() -> None: def downgrade() -> None: op.alter_column( - table_name='backlog_snapshots', + table_name='backlog_snapshot', column_name='created_at', existing_type=sa.DateTime(), nullable=False, diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index de0bd36a..5d28f70f 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -776,6 +776,8 @@ async def has_urls_without_agency_suggestions( statement = ( select( URL.id + ).where( + URL.outcome == URLStatus.PENDING.value ) ) @@ -797,6 +799,7 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li statement = ( select(URL.id, URL.collector_metadata, Batch.strategy) + .where(URL.outcome == URLStatus.PENDING.value) .join(Batch) ) statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement) diff --git a/core/TaskManager.py b/core/TaskManager.py index 4761a62b..052bdbc8 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -101,7 +101,7 @@ async def get_task_operators(self) -> list[TaskOperatorBase]: await self.get_url_html_task_operator(), # await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), - # await self.get_agency_identification_task_operator(), + await self.get_agency_identification_task_operator(), await self.get_url_miscellaneous_metadata_task_operator(), await self.get_submit_approved_url_task_operator() ] @@ -122,10 +122,9 @@ async def run_tasks(self): while meets_prereq: print(f"Running {operator.task_type.value} Task") if count > TASK_REPEAT_THRESHOLD: - self.discord_poster.post_to_discord( - message=f"Task {operator.task_type.value} has been run" - f" more than {TASK_REPEAT_THRESHOLD} times in a row. " - f"Task loop terminated.") + message = f"Task {operator.task_type.value} has been run more than {TASK_REPEAT_THRESHOLD} times in a row. Task loop terminated." 
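+                    # Report the stalled task on stdout and via Discord before terminating the loop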
+ print(message) + self.discord_poster.post_to_discord(message=message) break task_id = await self.initiate_task_in_db(task_type=operator.task_type) run_info: TaskOperatorRunInfo = await operator.run_task(task_id) diff --git a/hugging_face/HuggingFaceInterface.py b/hugging_face/HuggingFaceInterface.py index 9ad11d0b..3dff8ccd 100644 --- a/hugging_face/HuggingFaceInterface.py +++ b/hugging_face/HuggingFaceInterface.py @@ -1,5 +1,6 @@ import asyncio import json +import os import sys from typing import List @@ -17,17 +18,22 @@ async def get_url_relevancy_async(urls_with_html: List[URLWithHTML]) -> List[boo stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, + env=os.environ.copy(), # ⬅️ ensure env variables are inherited ) stdout, stderr = await proc.communicate(input=input_data.encode("utf-8")) + print(stderr) raw_output = stdout.decode("utf-8").strip() + if proc.returncode != 0: + raise RuntimeError(f"Error running HuggingFace: {stderr}/{raw_output}") + # Try to extract the actual JSON line for line in raw_output.splitlines(): try: return json.loads(line) - except json.JSONDecodeError: + except json.JSONDecodeError as e: continue raise RuntimeError(f"Could not parse JSON from subprocess: {raw_output}") diff --git a/hugging_face/relevancy_worker.py b/hugging_face/relevancy_worker.py index 5d07d10f..dd158898 100644 --- a/hugging_face/relevancy_worker.py +++ b/hugging_face/relevancy_worker.py @@ -1,3 +1,4 @@ +import os import sys import json from transformers import pipeline @@ -7,6 +8,13 @@ def main(): pipe = pipeline("text-classification", model="PDAP/url-relevance") results = pipe(urls) + + print("Executable:", sys.executable, file=sys.stderr) + print("sys.path:", sys.path, file=sys.stderr) + print("PYTHONPATH:", os.getenv("PYTHONPATH"), file=sys.stderr) + + if len(results) != len(urls): + raise RuntimeError(f"Expected {len(urls)} results, got {len(results)}") bools = [r["score"] >= 0.5 for r in results] print(json.dumps(bools)) diff --git a/local_database/classes/DockerContainer.py b/local_database/classes/DockerContainer.py index ee2ecba9..33b71ce0 100644 --- a/local_database/classes/DockerContainer.py +++ b/local_database/classes/DockerContainer.py @@ -17,6 +17,12 @@ def run_command(self, command: str): def stop(self): self.container.stop() + def log_to_file(self): + logs = self.container.logs(stdout=True, stderr=True) + container_name = self.container.name + with open(f"{container_name}.log", "wb") as f: + f.write(logs) + def wait_for_pg_to_be_ready(self): for i in range(30): exit_code, output = self.container.exec_run("pg_isready") diff --git a/tests/helpers/DBDataCreator.py b/tests/helpers/DBDataCreator.py index 71338d84..38d70cfe 100644 --- a/tests/helpers/DBDataCreator.py +++ b/tests/helpers/DBDataCreator.py @@ -28,7 +28,7 @@ class URLCreationInfo(BaseModel): url_mappings: list[URLMapping] outcome: URLStatus - annotation_info: AnnotationInfo + annotation_info: Optional[AnnotationInfo] = None class BatchURLCreationInfoV2(BaseModel): batch_id: int @@ -109,7 +109,7 @@ async def batch_v2( d[url_parameters.status] = URLCreationInfo( url_mappings=iui.url_mappings, outcome=url_parameters.status, - annotation_info=url_parameters.annotation_info + annotation_info=url_parameters.annotation_info if url_parameters.annotation_info.has_annotations() else None ) return BatchURLCreationInfoV2( batch_id=batch_id, diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py 
b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py index cd9556cb..87c8efe0 100644 --- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -5,9 +5,10 @@ import pytest from aiohttp import ClientSession +from helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse from collector_db.models import Agency, AutomatedUrlAgencySuggestion -from collector_manager.enums import CollectorType +from collector_manager.enums import CollectorType, URLStatus from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.classes.task_operators.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator @@ -20,7 +21,7 @@ from pdap_api_client.DTOs import MatchAgencyResponse, MatchAgencyInfo from pdap_api_client.PDAPClient import PDAPClient from pdap_api_client.enums import MatchAgencyResponseStatus -from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo +from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo, BatchURLCreationInfoV2 sample_agency_suggestions = [ URLAgencySuggestionInfo( @@ -103,8 +104,25 @@ async def mock_run_subtask( CollectorType.MUCKROCK_ALL_SEARCH, CollectorType.CKAN ]: - creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls(strategy=strategy, url_count=1, with_html_content=True) - d[strategy] = creation_info.url_ids[0] + # Create two URLs for each, one pending and one errored + creation_info: BatchURLCreationInfoV2 = await db_data_creator.batch_v2( + parameters=TestBatchCreationParameters( + strategy=strategy, + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.PENDING, + with_html_content=True + ), + TestURLCreationParameters( + count=1, + status=URLStatus.ERROR, + with_html_content=True + ) + ] + ) + ) + d[strategy] = creation_info.url_creation_infos[URLStatus.PENDING].url_mappings[0].url_id # Confirm meets prerequisites From bd27536cb9f6875fa378a0524b0282d08a93a834 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 12 May 2025 18:37:30 -0400 Subject: [PATCH 178/182] fix(app): Address bug in agency identification Previously, agency identification was erroneously pulling up URLs that had an error status. This has been addressed. 
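
The change here is confined to the test module's import, swapping the bare
`helpers` path for the package-absolute form used by the rest of the suite:

    from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters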
--- .../integration/tasks/test_agency_preannotation_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py index 87c8efe0..6818c683 100644 --- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -5,7 +5,7 @@ import pytest from aiohttp import ClientSession -from helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters +from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse from collector_db.models import Agency, AutomatedUrlAgencySuggestion from collector_manager.enums import CollectorType, URLStatus From dd643b648cb79237bfd42707202db8dad6218bd3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 13 May 2025 08:16:06 -0400 Subject: [PATCH 179/182] feat(app): add url duplicate check task operator --- ..._create_url_checked_for_duplicate_table.py | 78 +++++++++++++++ collector_db/AsyncDatabaseClient.py | 47 ++++++++- collector_db/enums.py | 1 + collector_db/models.py | 14 +++ .../DTOs/task_data_objects/URLDuplicateTDO.py | 9 ++ core/TaskManager.py | 9 ++ .../URLDuplicateTaskOperator.py | 33 +++++++ pdap_api_client/DTOs.py | 2 +- pdap_api_client/PDAPClient.py | 11 +-- .../integration/tasks/conftest.py | 20 ++++ .../tasks/test_submit_approved_url_task.py | 13 +-- .../tasks/test_url_duplicate_task.py | 98 +++++++++++++++++++ 12 files changed, 314 insertions(+), 21 deletions(-) create mode 100644 alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py create mode 100644 core/DTOs/task_data_objects/URLDuplicateTDO.py create mode 100644 core/classes/task_operators/URLDuplicateTaskOperator.py create mode 100644 tests/test_automated/integration/tasks/conftest.py create mode 100644 tests/test_automated/integration/tasks/test_url_duplicate_task.py diff --git a/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py b/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py new file mode 100644 index 00000000..2719d33c --- /dev/null +++ b/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py @@ -0,0 +1,78 @@ +"""Create url_checked_for_duplicate table + +Revision ID: 864107b703ae +Revises: 9d4002437ebe +Create Date: 2025-05-13 07:04:22.592396 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from util.alembic_helpers import switch_enum_type + +# revision identifiers, used by Alembic. 
+revision: str = '864107b703ae' +down_revision: Union[str, None] = '9d4002437ebe' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'url_checked_for_duplicate', + sa.Column( + 'id', + sa.Integer(), + primary_key=True + ), + sa.Column( + 'url_id', + sa.Integer(), + sa.ForeignKey( + 'urls.id', + ondelete='CASCADE' + ), + nullable=False + ), + sa.Column( + 'created_at', + sa.DateTime(), + nullable=False, + server_default=sa.text('now()') + ), + ) + + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + "HTML", + "Relevancy", + "Record Type", + "Agency Identification", + "Misc Metadata", + "Submit Approved URLs", + "Duplicate Detection" + ] + ) + + +def downgrade() -> None: + op.drop_table('url_checked_for_duplicate') + + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + "HTML", + "Relevancy", + "Record Type", + "Agency Identification", + "Misc Metadata", + "Submit Approved URLs", + ] + ) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 5d28f70f..03c652c9 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -29,7 +29,7 @@ RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \ UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \ UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log, \ - BacklogSnapshot, URLDataSource + BacklogSnapshot, URLDataSource, URLCheckedForDuplicate from collector_manager.enums import URLStatus, CollectorType from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo @@ -60,6 +60,7 @@ from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo +from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from core.EnvVarManager import EnvVarManager from core.enums import BatchStatus, SuggestionType, RecordType @@ -2224,4 +2225,48 @@ async def populate_backlog_snapshot( session.add(snapshot) + @session_manager + async def has_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> bool: + query = (select( + URL.id + ).outerjoin( + URLCheckedForDuplicate, + URL.id == URLCheckedForDuplicate.url_id + ).where( + URL.outcome == URLStatus.PENDING.value, + URLCheckedForDuplicate.id == None + ).limit(1) + ) + raw_result = await session.execute(query) + result = raw_result.one_or_none() + return result is not None + + @session_manager + async def get_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> List[URLDuplicateTDO]: + query = (select( + URL + ).outerjoin( + URLCheckedForDuplicate, + URL.id == URLCheckedForDuplicate.url_id + ).where( + URL.outcome == URLStatus.PENDING.value, + URLCheckedForDuplicate.id == None + ).limit(100) + ) + + raw_result = await session.execute(query) + urls = raw_result.scalars().all() + return [URLDuplicateTDO(url=url.url, url_id=url.id) for url in urls] + + + @session_manager 
+ async def mark_all_as_duplicates(self, session: AsyncSession, url_ids: List[int]): + query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.DUPLICATE.value) + await session.execute(query) + + @session_manager + async def mark_as_checked_for_duplicates(self, session: AsyncSession, url_ids: list[int]): + for url_id in url_ids: + url_checked_for_duplicate = URLCheckedForDuplicate(url_id=url_id) + session.add(url_checked_for_duplicate) diff --git a/collector_db/enums.py b/collector_db/enums.py index b28b6091..d6b3ec0f 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -38,6 +38,7 @@ class TaskType(PyEnum): AGENCY_IDENTIFICATION = "Agency Identification" MISC_METADATA = "Misc Metadata" SUBMIT_APPROVED = "Submit Approved URLs" + DUPLICATE_DETECTION = "Duplicate Detection" IDLE = "Idle" class PGEnum(TypeDecorator): diff --git a/collector_db/models.py b/collector_db/models.py index b38243dd..b2a86e9c 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -141,7 +141,21 @@ class URL(Base): back_populates="url", uselist=False ) + checked_for_duplicate = relationship( + "URLCheckedForDuplicate", + uselist=False, + back_populates="url" + ) + +class URLCheckedForDuplicate(Base): + __tablename__ = 'url_checked_for_duplicate' + id = Column(Integer, primary_key=True) + url_id = Column(Integer, ForeignKey('urls.id'), nullable=False) + created_at = get_created_at_column() + + # Relationships + url = relationship("URL", uselist=False, back_populates="checked_for_duplicate") class URLOptionalDataSourceMetadata(Base): __tablename__ = 'url_optional_data_source_metadata' diff --git a/core/DTOs/task_data_objects/URLDuplicateTDO.py b/core/DTOs/task_data_objects/URLDuplicateTDO.py new file mode 100644 index 00000000..af00ce38 --- /dev/null +++ b/core/DTOs/task_data_objects/URLDuplicateTDO.py @@ -0,0 +1,9 @@ +from typing import Optional + +from pydantic import BaseModel + + +class URLDuplicateTDO(BaseModel): + url_id: int + url: str + is_duplicate: Optional[bool] = None diff --git a/core/TaskManager.py b/core/TaskManager.py index 052bdbc8..1dcc9bb5 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -1,5 +1,6 @@ import logging +from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo @@ -96,9 +97,17 @@ async def get_url_miscellaneous_metadata_task_operator(self): ) return operator + async def get_url_duplicate_task_operator(self): + operator = URLDuplicateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return operator + async def get_task_operators(self) -> list[TaskOperatorBase]: return [ await self.get_url_html_task_operator(), + await self.get_url_duplicate_task_operator(), # await self.get_url_relevance_huggingface_task_operator(), await self.get_url_record_type_task_operator(), await self.get_agency_identification_task_operator(), diff --git a/core/classes/task_operators/URLDuplicateTaskOperator.py b/core/classes/task_operators/URLDuplicateTaskOperator.py new file mode 100644 index 00000000..32cea432 --- /dev/null +++ b/core/classes/task_operators/URLDuplicateTaskOperator.py @@ -0,0 +1,33 @@ +from collector_db.AsyncDatabaseClient import AsyncDatabaseClient +from collector_db.enums import TaskType +from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO +from 
core.classes.task_operators.TaskOperatorBase import TaskOperatorBase +from pdap_api_client.PDAPClient import PDAPClient + + +class URLDuplicateTaskOperator(TaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient + ): + super().__init__(adb_client) + self.pdap_client = pdap_client + + @property + def task_type(self): + return TaskType.DUPLICATE_DETECTION + + async def meets_task_prerequisites(self): + return await self.adb_client.has_pending_urls_not_checked_for_duplicates() + + async def inner_task_logic(self): + tdos: list[URLDuplicateTDO] = await self.adb_client.get_pending_urls_not_checked_for_duplicates() + url_ids = [tdo.url_id for tdo in tdos] + await self.link_urls_to_task(url_ids=url_ids) + for tdo in tdos: + tdo.is_duplicate = await self.pdap_client.is_url_duplicate(tdo.url) + duplicate_url_ids = [tdo.url_id for tdo in tdos if tdo.is_duplicate] + await self.adb_client.mark_all_as_duplicates(duplicate_url_ids) + await self.adb_client.mark_as_checked_for_duplicates(url_ids) diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py index 93f67839..342ad948 100644 --- a/pdap_api_client/DTOs.py +++ b/pdap_api_client/DTOs.py @@ -25,7 +25,7 @@ class ApprovalStatus(Enum): class UniqueURLDuplicateInfo(BaseModel): original_url: str approval_status: ApprovalStatus - rejection_note: str + rejection_note: Optional[str] = None class UniqueURLResponseInfo(BaseModel): is_unique: bool diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index 24b9d98c..ad3c74ea 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -59,10 +59,10 @@ async def match_agency( ) - async def is_url_unique( + async def is_url_duplicate( self, url_to_check: str - ) -> UniqueURLResponseInfo: + ) -> bool: """ Check if a URL is unique. 
Returns duplicate info otherwise """ @@ -79,11 +79,8 @@ async def is_url_unique( ) response_info = await self.access_manager.make_request(request_info) duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] - is_unique = (len(duplicates) == 0) - return UniqueURLResponseInfo( - is_unique=is_unique, - duplicates=duplicates - ) + is_duplicate = (len(duplicates) != 0) + return is_duplicate async def submit_urls( self, diff --git a/tests/test_automated/integration/tasks/conftest.py b/tests/test_automated/integration/tasks/conftest.py new file mode 100644 index 00000000..6a925cc5 --- /dev/null +++ b/tests/test_automated/integration/tasks/conftest.py @@ -0,0 +1,20 @@ +from unittest.mock import MagicMock, AsyncMock + +import pytest + +from pdap_api_client.AccessManager import AccessManager +from pdap_api_client.PDAPClient import PDAPClient + + +@pytest.fixture +def mock_pdap_client() -> PDAPClient: + mock_access_manager = MagicMock( + spec=AccessManager + ) + mock_access_manager.jwt_header = AsyncMock( + return_value={"Authorization": "Bearer token"} + ) + pdap_client = PDAPClient( + access_manager=mock_access_manager + ) + return pdap_client \ No newline at end of file diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py index 32dc765c..c8aa86eb 100644 --- a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py +++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py @@ -46,18 +46,7 @@ def mock_make_request(pdap_client: PDAPClient, urls: list[str]): ) ) -@pytest.fixture -def mock_pdap_client() -> PDAPClient: - mock_access_manager = MagicMock( - spec=AccessManager - ) - mock_access_manager.jwt_header = AsyncMock( - return_value={"Authorization": "Bearer token"} - ) - pdap_client = PDAPClient( - access_manager=mock_access_manager - ) - return pdap_client + async def setup_validated_urls(db_data_creator: DBDataCreator) -> list[str]: creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls( diff --git a/tests/test_automated/integration/tasks/test_url_duplicate_task.py b/tests/test_automated/integration/tasks/test_url_duplicate_task.py new file mode 100644 index 00000000..886e91a3 --- /dev/null +++ b/tests/test_automated/integration/tasks/test_url_duplicate_task.py @@ -0,0 +1,98 @@ +from http import HTTPStatus +from unittest.mock import MagicMock + +import pytest + +from collector_db.DTOs.URLMapping import URLMapping +from collector_db.models import URL, URLCheckedForDuplicate +from collector_manager.enums import CollectorType, URLStatus +from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome +from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator +from helpers.DBDataCreator import DBDataCreator +from helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters +from pdap_api_client.DTOs import ResponseInfo +from pdap_api_client.PDAPClient import PDAPClient + + +@pytest.mark.asyncio +async def test_url_duplicate_task( + db_data_creator: DBDataCreator, + mock_pdap_client: PDAPClient +): + + + operator = URLDuplicateTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=mock_pdap_client + ) + + assert not await operator.meets_task_prerequisites() + make_request_mock: MagicMock = mock_pdap_client.access_manager.make_request + + make_request_mock.assert_not_called() + + # Add three URLs to the database, one of 
which is in error, the other two pending + creation_info = await db_data_creator.batch_v2( + parameters=TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=1, + status=URLStatus.ERROR + ), + TestURLCreationParameters( + count=2, + status=URLStatus.PENDING + ), + ] + ) + ) + pending_urls: list[URLMapping] = creation_info.url_creation_infos[URLStatus.PENDING].url_mappings + duplicate_url = pending_urls[0] + non_duplicate_url = pending_urls[1] + assert await operator.meets_task_prerequisites() + make_request_mock.assert_not_called() + + make_request_mock.side_effect = [ + ResponseInfo( + data={ + "duplicates": [ + { + "original_url": duplicate_url.url, + "approval_status": "approved" + } + ], + }, + status_code=HTTPStatus.OK + ), + ResponseInfo( + data={ + "duplicates": [], + }, + status_code=HTTPStatus.OK + ), + ] + run_info = await operator.run_task(1) + assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message + assert make_request_mock.call_count == 2 + + adb_client = db_data_creator.adb_client + urls: list[URL] = await adb_client.get_all(URL) + assert len(urls) == 3 + url_ids = [url.id for url in urls] + assert duplicate_url.url_id in url_ids + for url in urls: + if url.id == duplicate_url.url_id: + assert url.outcome == URLStatus.DUPLICATE.value + + checked_for_duplicates: list[URLCheckedForDuplicate] = await adb_client.get_all(URLCheckedForDuplicate) + assert len(checked_for_duplicates) == 2 + checked_for_duplicate_url_ids = [url.url_id for url in checked_for_duplicates] + assert duplicate_url.url_id in checked_for_duplicate_url_ids + assert non_duplicate_url.url_id in checked_for_duplicate_url_ids + + assert not await operator.meets_task_prerequisites() + + + + + From 845cb1be00e418202d255c1656f4b85a410e321a Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 13 May 2025 08:19:27 -0400 Subject: [PATCH 180/182] feat(app): add url duplicate check task operator --- .../integration/tasks/test_url_duplicate_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_automated/integration/tasks/test_url_duplicate_task.py b/tests/test_automated/integration/tasks/test_url_duplicate_task.py index 886e91a3..1b3e77d8 100644 --- a/tests/test_automated/integration/tasks/test_url_duplicate_task.py +++ b/tests/test_automated/integration/tasks/test_url_duplicate_task.py @@ -8,8 +8,8 @@ from collector_manager.enums import CollectorType, URLStatus from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator -from helpers.DBDataCreator import DBDataCreator -from helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters +from tests.helpers.DBDataCreator import DBDataCreator +from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters from pdap_api_client.DTOs import ResponseInfo from pdap_api_client.PDAPClient import PDAPClient From b4326d6fdf399f71b3fff5edc3b7ca9629e62293 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 13 May 2025 09:59:36 -0400 Subject: [PATCH 181/182] feat(app): replace in-project access manager with `pdap_access_manager` --- ..._create_url_checked_for_duplicate_table.py | 3 + api/main.py | 3 +- pdap_api_client/AccessManager.py | 159 --------------- pdap_api_client/DTOs.py | 47 ----- pdap_api_client/PDAPClient.py | 88 ++++---- pyproject.toml | 4 +- .../manual/pdap_client/test_access_manager.py | 6 +- 
tests/manual/pdap_client/test_pdap_client.py | 22 +- .../integration/tasks/conftest.py | 5 +- .../tasks/test_agency_preannotation_task.py | 2 +- .../tasks/test_submit_approved_url_task.py | 16 +- .../tasks/test_url_duplicate_task.py | 4 +- uv.lock | 190 ++++++++++++------ 13 files changed, 224 insertions(+), 325 deletions(-) delete mode 100644 pdap_api_client/AccessManager.py diff --git a/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py b/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py index 2719d33c..e2e5947f 100644 --- a/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py +++ b/alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py @@ -63,6 +63,9 @@ def upgrade() -> None: def downgrade() -> None: op.drop_table('url_checked_for_duplicate') + # Delete tasks of type "Duplicate Detection" + op.execute("DELETE FROM TASKS WHERE TASK_TYPE = 'Duplicate Detection';") + switch_enum_type( table_name='tasks', column_name='task_type', diff --git a/api/main.py b/api/main.py index 94b52cd2..eeb3e8a8 100644 --- a/api/main.py +++ b/api/main.py @@ -1,4 +1,3 @@ -import asyncio from contextlib import asynccontextmanager import aiohttp @@ -28,7 +27,7 @@ from html_tag_collector.RootURLCache import RootURLCache from html_tag_collector.URLRequestInterface import URLRequestInterface from hugging_face.HuggingFaceInterface import HuggingFaceInterface -from pdap_api_client.AccessManager import AccessManager +from pdap_access_manager import AccessManager from pdap_api_client.PDAPClient import PDAPClient from util.DiscordNotifier import DiscordPoster diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py deleted file mode 100644 index aadd8451..00000000 --- a/pdap_api_client/AccessManager.py +++ /dev/null @@ -1,159 +0,0 @@ -from http import HTTPStatus -from typing import Optional - -import requests -from aiohttp import ClientSession - -from core.EnvVarManager import EnvVarManager -from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo - -request_methods = { - RequestType.POST: ClientSession.post, - RequestType.PUT: ClientSession.put, - RequestType.GET: ClientSession.get, - RequestType.DELETE: ClientSession.delete, -} - - -class CustomHTTPException(Exception): - pass - - -def build_url( - namespace: Namespaces, - subdomains: Optional[list[str]] = None -): - api_url = EnvVarManager.get().pdap_api_url - url = f"{api_url}/{namespace.value}" - if subdomains is not None: - url = f"{url}/{'/'.join(subdomains)}" - return url - - -class AccessManager: - """ - Manages login, api key, access and refresh tokens - """ - def __init__( - self, - email: str, - password: str, - session: Optional[ClientSession] = None, - api_key: Optional[str] = None, - ): - self.session = session - self._access_token = None - self._refresh_token = None - self.api_key = api_key - self.email = email - self.password = password - - @property - async def access_token(self): - if self._access_token is None: - await self.login( - email=self.email, - password=self.password - ) - return self._access_token - - @property - async def refresh_token(self): - if self._refresh_token is None: - await self.login( - email=self.email, - password=self.password - ) - return self._refresh_token - - # TODO: Add means to refresh if token expired. 
- - async def load_api_key(self): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["api-key"] - ) - request_info = RequestInfo( - type_ = RequestType.POST, - url=url, - headers=await self.jwt_header() - ) - response_info = await self.make_request(request_info) - self.api_key = response_info.data["api_key"] - - async def refresh_access_token(self): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["refresh-session"], - ) - refresh_token = await self.refresh_token - rqi = RequestInfo( - type_=RequestType.POST, - url=url, - json={"refresh_token": refresh_token}, - headers=await self.jwt_header() - ) - rsi = await self.make_request(rqi) - data = rsi.data - self._access_token = data['access_token'] - self._refresh_token = data['refresh_token'] - - async def make_request(self, ri: RequestInfo) -> ResponseInfo: - try: - method = getattr(self.session, ri.type_.value.lower()) - async with method(**ri.kwargs()) as response: - response.raise_for_status() - json = await response.json() - return ResponseInfo( - status_code=HTTPStatus(response.status), - data=json - ) - except requests.RequestException as e: - # TODO: Precise string matching here is brittle. Consider changing later. - if json.message == "Token is expired. Please request a new token.": - await self.refresh_access_token() - return await self.make_request(ri) - else: - raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") - - - async def login(self, email: str, password: str): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["login"] - ) - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - json={ - "email": email, - "password": password - } - ) - response_info = await self.make_request(request_info) - data = response_info.data - self._access_token = data["access_token"] - self._refresh_token = data["refresh_token"] - - - async def jwt_header(self) -> dict: - """ - Retrieve JWT header - Returns: Dictionary of Bearer Authorization with JWT key - """ - access_token = await self.access_token - return { - "Authorization": f"Bearer {access_token}" - } - - def api_key_header(self): - """ - Retrieve API key header - Returns: Dictionary of Basic Authorization with API key - - """ - if self.api_key is None: - self.load_api_key() - return { - "Authorization": f"Basic {self.api_key}" - } diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py index 342ad948..23d240d7 100644 --- a/pdap_api_client/DTOs.py +++ b/pdap_api_client/DTOs.py @@ -1,5 +1,4 @@ from enum import Enum -from http import HTTPStatus from typing import Optional, List from pydantic import BaseModel @@ -20,57 +19,11 @@ class ApprovalStatus(Enum): PENDING = "pending" NEEDS_IDENTIFICATION = "needs identification" - - class UniqueURLDuplicateInfo(BaseModel): original_url: str approval_status: ApprovalStatus rejection_note: Optional[str] = None -class UniqueURLResponseInfo(BaseModel): - is_unique: bool - duplicates: list[UniqueURLDuplicateInfo] - - -class Namespaces(Enum): - AUTH = "auth" - MATCH = "match" - CHECK = "check" - DATA_SOURCES = "data-sources" - SOURCE_COLLECTOR = "source-collector" - - -class RequestType(Enum): - POST = "POST" - PUT = "PUT" - GET = "GET" - DELETE = "DELETE" - - -class RequestInfo(BaseModel): - type_: RequestType - url: str - json: Optional[dict] = None - headers: Optional[dict] = None - params: Optional[dict] = None - timeout: Optional[int] = 10 - - def kwargs(self) -> dict: - d = { - "url": self.url, - } - if self.json is not None: - d['json'] = self.json - if self.headers is 
not None: - d['headers'] = self.headers - return d - - -class ResponseInfo(BaseModel): - status_code: HTTPStatus - data: Optional[dict] - - class MatchAgencyResponse(BaseModel): status: MatchAgencyResponseStatus matches: List[MatchAgencyInfo] diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index ad3c74ea..491b7c3b 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,41 +1,42 @@ from typing import Optional from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo -from pdap_api_client.AccessManager import build_url, AccessManager -from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ - RequestType, RequestInfo, MatchAgencyResponse +from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, \ + MatchAgencyResponse from pdap_api_client.enums import MatchAgencyResponseStatus +from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType class PDAPClient: def __init__( - self, - access_manager: AccessManager, + self, + access_manager: AccessManager, ): self.access_manager = access_manager async def match_agency( - self, - name: str, - state: Optional[str] = None, - county: Optional[str] = None, - locality: Optional[str] = None + self, + name: str, + state: Optional[str] = None, + county: Optional[str] = None, + locality: Optional[str] = None ) -> MatchAgencyResponse: """ Returns agencies, if any, that match or partially match the search criteria """ - url = build_url( - namespace=Namespaces.MATCH, + url = self.access_manager.build_url( + namespace=DataSourcesNamespaces.MATCH, subdomains=["agency"] ) + headers = await self.access_manager.jwt_header() headers['Content-Type'] = "application/json" request_info = RequestInfo( type_=RequestType.POST, url=url, headers=headers, - json={ + json_={ "name": name, "state": state, "county": county, @@ -43,22 +44,24 @@ async def match_agency( } ) response_info = await self.access_manager.make_request(request_info) - - matches = [ - MatchAgencyInfo( - id = agency['id'], - submitted_name=agency['name'], - state=agency['state'], - county=agency['county'], - locality=agency['locality'] + matches = [] + for agency in response_info.data["agencies"]: + mai = MatchAgencyInfo( + id=agency['id'], + submitted_name=agency['name'] ) - for agency in response_info.data["agencies"]] + if len(agency['locations']) > 0: + first_location = agency['locations'][0] + mai.state = first_location['state'] + mai.county = first_location['county'] + mai.locality = first_location['locality'] + matches.append(mai) + return MatchAgencyResponse( status=MatchAgencyResponseStatus(response_info.data["status"]), matches=matches ) - async def is_url_duplicate( self, url_to_check: str @@ -66,8 +69,8 @@ async def is_url_duplicate( """ Check if a URL is unique. 
Returns duplicate info otherwise """ - url = build_url( - namespace=Namespaces.CHECK, + url = self.access_manager.build_url( + namespace=DataSourcesNamespaces.CHECK, subdomains=["unique-url"] ) request_info = RequestInfo( @@ -83,15 +86,15 @@ async def is_url_duplicate( return is_duplicate async def submit_urls( - self, - tdos: list[SubmitApprovedURLTDO] + self, + tdos: list[SubmitApprovedURLTDO] ) -> list[SubmittedURLInfo]: """ Submits URLs to Data Sources App, modifying tdos in-place with data source id or error """ - request_url = build_url( - namespace=Namespaces.SOURCE_COLLECTOR, + request_url = self.access_manager.build_url( + namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=["data-sources"] ) @@ -102,25 +105,26 @@ async def submit_urls( data_sources_json = [] for tdo in tdos: - data_sources_json.append({ - "name": tdo.name, - "description": tdo.description, - "source_url": tdo.url, - "record_type": tdo.record_type.value, - "record_formats": tdo.record_formats, - "data_portal_type": tdo.data_portal_type, - "last_approval_editor": tdo.approving_user_id, - "supplying_entity": tdo.supplying_entity, - "agency_ids": tdo.agency_ids - }) - + data_sources_json.append( + { + "name": tdo.name, + "description": tdo.description, + "source_url": tdo.url, + "record_type": tdo.record_type.value, + "record_formats": tdo.record_formats, + "data_portal_type": tdo.data_portal_type, + "last_approval_editor": tdo.approving_user_id, + "supplying_entity": tdo.supplying_entity, + "agency_ids": tdo.agency_ids + } + ) headers = await self.access_manager.jwt_header() request_info = RequestInfo( type_=RequestType.POST, url=request_url, headers=headers, - json={ + json_={ "data_sources": data_sources_json } ) diff --git a/pyproject.toml b/pyproject.toml index 5d2269c7..8a2b1187 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,10 +24,11 @@ dependencies = [ "numpy~=1.26.4", "openai~=1.60.1", "pandas~=2.2.3", + "pdap-access-manager==0.3.5", "playwright~=1.49.1", "psycopg2-binary~=2.9.6", "psycopg[binary]~=3.1.20", - "pydantic~=2.10.6", + "pydantic~=2.11.3", "pyjwt~=2.10.1", "python-dotenv~=1.0.1", "requests~=2.32.3", @@ -43,6 +44,7 @@ dependencies = [ [dependency-groups] dev = [ + "deepdiff>=8.5.0", "docker>=7.1.0", "pendulum>=3.1.0", "pytest>=7.2.2", diff --git a/tests/manual/pdap_client/test_access_manager.py b/tests/manual/pdap_client/test_access_manager.py index ff08ee0e..b1245eca 100644 --- a/tests/manual/pdap_client/test_access_manager.py +++ b/tests/manual/pdap_client/test_access_manager.py @@ -1,7 +1,7 @@ import pytest from aiohttp import ClientSession -from pdap_api_client.AccessManager import AccessManager +from pdap_access_manager import AccessManager from util.helper_functions import get_from_env @@ -9,8 +9,8 @@ async def test_refresh_session(): async with ClientSession() as session: access_manager = AccessManager( - email=get_from_env("PDAP_EMAIL"), - password=get_from_env("PDAP_PASSWORD"), + email=get_from_env("PDAP_PROD_EMAIL"), + password=get_from_env("PDAP_PROD_PASSWORD"), api_key=get_from_env("PDAP_API_KEY", allow_none=True), session=session ) diff --git a/tests/manual/pdap_client/test_pdap_client.py b/tests/manual/pdap_client/test_pdap_client.py index b1232244..5d10037c 100644 --- a/tests/manual/pdap_client/test_pdap_client.py +++ b/tests/manual/pdap_client/test_pdap_client.py @@ -1,7 +1,7 @@ import pytest from aiohttp import ClientSession -from pdap_api_client.AccessManager import AccessManager +from pdap_access_manager import AccessManager from pdap_api_client.PDAPClient 
import PDAPClient from util.helper_functions import get_from_env @@ -11,8 +11,8 @@ async def test_match_agency(): async with ClientSession() as session: access_manager = AccessManager( - email=get_from_env("PDAP_EMAIL"), - password=get_from_env("PDAP_PASSWORD"), + email=get_from_env("PDAP_PROD_EMAIL"), + password=get_from_env("PDAP_PROD_PASSWORD"), api_key=get_from_env("PDAP_API_KEY", allow_none=True), session=session ) @@ -21,3 +21,19 @@ async def test_match_agency(): response = await pdap_client.match_agency(name="police") print(response) + +@pytest.mark.asyncio +async def test_check_for_duplicate(): + + async with ClientSession() as session: + access_manager = AccessManager( + email=get_from_env("PDAP_PROD_EMAIL"), + password=get_from_env("PDAP_PROD_PASSWORD"), + api_key=get_from_env("PDAP_API_KEY", allow_none=True), + session=session + ) + pdap_client = PDAPClient(access_manager=access_manager) + + response = await pdap_client.is_url_duplicate(url_to_check="https://example.com") + + print(response) \ No newline at end of file diff --git a/tests/test_automated/integration/tasks/conftest.py b/tests/test_automated/integration/tasks/conftest.py index 6a925cc5..a4136b20 100644 --- a/tests/test_automated/integration/tasks/conftest.py +++ b/tests/test_automated/integration/tasks/conftest.py @@ -2,7 +2,7 @@ import pytest -from pdap_api_client.AccessManager import AccessManager +from pdap_access_manager import AccessManager from pdap_api_client.PDAPClient import PDAPClient @@ -11,6 +11,9 @@ def mock_pdap_client() -> PDAPClient: mock_access_manager = MagicMock( spec=AccessManager ) + mock_access_manager.build_url = MagicMock( + return_value="http://example.com" + ) mock_access_manager.jwt_header = AsyncMock( return_value={"Authorization": "Bearer token"} ) diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py index 6818c683..e6278292 100644 --- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -17,7 +17,7 @@ from core.classes.subtasks.CommonCrawlerAgencyIdentificationSubtask import CommonCrawlerAgencyIdentificationSubtask from core.classes.subtasks.MuckrockAgencyIdentificationSubtask import MuckrockAgencyIdentificationSubtask from core.enums import SuggestionType -from pdap_api_client.AccessManager import AccessManager +from pdap_access_manager import AccessManager from pdap_api_client.DTOs import MatchAgencyResponse, MatchAgencyInfo from pdap_api_client.PDAPClient import PDAPClient from pdap_api_client.enums import MatchAgencyResponseStatus diff --git a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py index c8aa86eb..1477915f 100644 --- a/tests/test_automated/integration/tasks/test_submit_approved_url_task.py +++ b/tests/test_automated/integration/tasks/test_submit_approved_url_task.py @@ -2,6 +2,7 @@ from unittest.mock import MagicMock, AsyncMock import pytest +from deepdiff import DeepDiff from collector_db.enums import TaskType from collector_db.models import URL, URLErrorInfo, URLDataSource @@ -11,8 +12,7 @@ from core.classes.task_operators.SubmitApprovedURLTaskOperator import SubmitApprovedURLTaskOperator from core.enums import RecordType, SubmitResponseStatus from tests.helpers.DBDataCreator import BatchURLCreationInfo, DBDataCreator -from pdap_api_client.AccessManager import 
AccessManager -from pdap_api_client.DTOs import RequestInfo, RequestType, ResponseInfo +from pdap_access_manager import RequestInfo, RequestType, ResponseInfo, DataSourcesNamespaces from pdap_api_client.PDAPClient import PDAPClient @@ -164,13 +164,17 @@ async def test_submit_approved_url_task( # Check mock method was called expected parameters access_manager = mock_pdap_client.access_manager access_manager.make_request.assert_called_once() + access_manager.build_url.assert_called_with( + namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, + subdomains=['data-sources'] + ) call_1 = access_manager.make_request.call_args_list[0][0][0] expected_call_1 = RequestInfo( type_=RequestType.POST, - url="TEST/source-collector/data-sources", + url="http://example.com", headers=access_manager.jwt_header.return_value, - json={ + json_={ "data_sources": [ { "name": "URL 1 Name", @@ -209,6 +213,6 @@ async def test_submit_approved_url_task( } ) assert call_1.type_ == expected_call_1.type_ - assert call_1.url == expected_call_1.url assert call_1.headers == expected_call_1.headers - assert call_1.json == expected_call_1.json + diff = DeepDiff(call_1.json_, expected_call_1.json_, ignore_order=True) + assert diff == {}, f"Differences found: {diff}" diff --git a/tests/test_automated/integration/tasks/test_url_duplicate_task.py b/tests/test_automated/integration/tasks/test_url_duplicate_task.py index 1b3e77d8..d66cfe27 100644 --- a/tests/test_automated/integration/tasks/test_url_duplicate_task.py +++ b/tests/test_automated/integration/tasks/test_url_duplicate_task.py @@ -5,12 +5,12 @@ from collector_db.DTOs.URLMapping import URLMapping from collector_db.models import URL, URLCheckedForDuplicate -from collector_manager.enums import CollectorType, URLStatus +from collector_manager.enums import URLStatus from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator from tests.helpers.DBDataCreator import DBDataCreator from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters -from pdap_api_client.DTOs import ResponseInfo +from pdap_access_manager import ResponseInfo from pdap_api_client.PDAPClient import PDAPClient diff --git a/uv.lock b/uv.lock index bb269479..773fee9e 100644 --- a/uv.lock +++ b/uv.lock @@ -218,6 +218,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload_time = "2025-04-15T17:05:12.221Z" }, ] +[[package]] +name = "boltons" +version = "25.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/63/54/71a94d8e02da9a865587fb3fff100cb0fc7aa9f4d5ed9ed3a591216ddcc7/boltons-25.0.0.tar.gz", hash = "sha256:e110fbdc30b7b9868cb604e3f71d4722dd8f4dcb4a5ddd06028ba8f1ab0b5ace", size = 246294, upload_time = "2025-02-03T05:57:59.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/7f/0e961cf3908bc4c1c3e027de2794f867c6c89fb4916fc7dba295a0e80a2d/boltons-25.0.0-py3-none-any.whl", hash = "sha256:dc9fb38bf28985715497d1b54d00b62ea866eca3938938ea9043e254a3a6ca62", size = 194210, upload_time = "2025-02-03T05:57:56.705Z" }, +] + [[package]] name = "bs4" version = "0.0.2" @@ -315,14 +324,14 @@ wheels = [ [[package]] name = "click" -version = "8.1.8" +version = "8.2.0" source = { registry = 
"https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload_time = "2024-12-21T18:38:44.339Z" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/0f/62ca20172d4f87d93cf89665fbaedcd560ac48b465bd1d92bfc7ea6b0a41/click-8.2.0.tar.gz", hash = "sha256:f5452aeddd9988eefa20f90f05ab66f17fce1ee2a36907fd30b05bbb5953814d", size = 235857, upload_time = "2025-05-10T22:21:03.111Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload_time = "2024-12-21T18:38:41.666Z" }, + { url = "https://files.pythonhosted.org/packages/a2/58/1f37bf81e3c689cc74ffa42102fa8915b59085f54a6e4a80bc6265c0f6bf/click-8.2.0-py3-none-any.whl", hash = "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c", size = 102156, upload_time = "2025-05-10T22:21:01.352Z" }, ] [[package]] @@ -360,6 +369,7 @@ dependencies = [ { name = "numpy" }, { name = "openai" }, { name = "pandas" }, + { name = "pdap-access-manager" }, { name = "playwright" }, { name = "psycopg", extra = ["binary"] }, { name = "psycopg2-binary" }, @@ -379,6 +389,7 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "deepdiff" }, { name = "docker" }, { name = "pendulum" }, { name = "pytest" }, @@ -410,10 +421,11 @@ requires-dist = [ { name = "numpy", specifier = "~=1.26.4" }, { name = "openai", specifier = "~=1.60.1" }, { name = "pandas", specifier = "~=2.2.3" }, + { name = "pdap-access-manager", specifier = "==0.3.5" }, { name = "playwright", specifier = "~=1.49.1" }, { name = "psycopg", extras = ["binary"], specifier = "~=3.1.20" }, { name = "psycopg2-binary", specifier = "~=2.9.6" }, - { name = "pydantic", specifier = "~=2.10.6" }, + { name = "pydantic", specifier = "~=2.11.3" }, { name = "pyjwt", specifier = "~=2.10.1" }, { name = "python-dotenv", specifier = "~=1.0.1" }, { name = "requests", specifier = "~=2.32.3" }, @@ -429,6 +441,7 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ + { name = "deepdiff", specifier = ">=8.5.0" }, { name = "docker", specifier = ">=7.1.0" }, { name = "pendulum", specifier = ">=3.1.0" }, { name = "pytest", specifier = ">=7.2.2" }, @@ -463,6 +476,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/59/46818ebeb708234a60e42ccf409d20709e482519d2aa450b501ddbba4594/datasets-2.19.2-py3-none-any.whl", hash = "sha256:e07ff15d75b1af75c87dd96323ba2a361128d495136652f37fd62f918d17bb4e", size = 542113, upload_time = "2024-06-03T05:11:41.151Z" }, ] +[[package]] +name = "deepdiff" +version = "8.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "orderly-set" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0a/0f/9cd2624f7dcd755cbf1fa21fb7234541f19a1be96a56f387ec9053ebe220/deepdiff-8.5.0.tar.gz", hash = "sha256:a4dd3529fa8d4cd5b9cbb6e3ea9c95997eaa919ba37dac3966c1b8f872dc1cd1", size = 538517, upload_time = "2025-05-09T18:44:10.035Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/3b/2e0797200c51531a6d8c97a8e4c9fa6fb56de7e6e2a15c1c067b6b10a0b0/deepdiff-8.5.0-py3-none-any.whl", hash = 
"sha256:d4599db637f36a1c285f5fdfc2cd8d38bde8d8be8636b65ab5e425b67c54df26", size = 85112, upload_time = "2025-05-09T18:44:07.784Z" }, +] + [[package]] name = "dill" version = "0.3.8" @@ -1408,6 +1433,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/23/cd/066e86230ae37ed0be70aae89aabf03ca8d9f39c8aea0dec8029455b5540/opt_einsum-3.4.0-py3-none-any.whl", hash = "sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd", size = 71932, upload_time = "2024-09-26T14:33:23.039Z" }, ] +[[package]] +name = "orderly-set" +version = "5.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/4a/38030da31c13dcd5a531490006e63a0954083fb115113be9393179738e25/orderly_set-5.4.1.tar.gz", hash = "sha256:a1fb5a4fdc5e234e9e8d8e5c1bbdbc4540f4dfe50d12bf17c8bc5dbf1c9c878d", size = 20943, upload_time = "2025-05-06T22:34:13.512Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/bc/e0dfb4db9210d92b44e49d6e61ba5caefbd411958357fa9d7ff489eeb835/orderly_set-5.4.1-py3-none-any.whl", hash = "sha256:b5e21d21680bd9ef456885db800c5cb4f76a03879880c0175e1b077fb166fd83", size = 12339, upload_time = "2025-05-06T22:34:12.564Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -1458,6 +1492,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload_time = "2024-09-20T13:09:48.112Z" }, ] +[[package]] +name = "pdap-access-manager" +version = "0.3.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "boltons" }, + { name = "pydantic" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/54/c0e76d1d54ff2f542f18b289db96c417d3bcd7e8e948de07921b492717e7/pdap_access_manager-0.3.5.tar.gz", hash = "sha256:5f8bbe0f25ef68810a0936ca22d40d3869d77391bae3c8ba1c885f8fe74154bd", size = 4120, upload_time = "2025-05-13T13:40:24.189Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/01/d4ba10d0d7be759e59011f4235c533b1bc31d5e99db86424cfd82284ce53/pdap_access_manager-0.3.5-py3-none-any.whl", hash = "sha256:b53a006e535d7733ca884560f41aa305068fec648c89524e397967a21e69a0d0", size = 4980, upload_time = "2025-05-13T13:40:23.223Z" }, +] + [[package]] name = "pendulum" version = "3.1.0" @@ -1793,69 +1842,82 @@ wheels = [ [[package]] name = "pydantic" -version = "2.10.6" +version = "2.11.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, + { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b7/ae/d5220c5c52b158b1de7ca89fc5edb72f304a70a4c540c84c8844bf4008de/pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236", size = 761681, upload_time = "2025-01-24T01:42:12.693Z" } +sdist = { url = "https://files.pythonhosted.org/packages/10/2e/ca897f093ee6c5f3b0bee123ee4465c50e75431c3d5b6a3b44a47134e891/pydantic-2.11.3.tar.gz", hash = "sha256:7471657138c16adad9322fe3070c0116dd6c3ad8d649300e3cbdfe91f4db4ec3", size = 785513, upload_time = "2025-04-08T13:27:06.399Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl", 
hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", size = 431696, upload_time = "2025-01-24T01:42:10.371Z" }, + { url = "https://files.pythonhosted.org/packages/b0/1d/407b29780a289868ed696d1616f4aad49d6388e5a77f567dcd2629dcd7b8/pydantic-2.11.3-py3-none-any.whl", hash = "sha256:a082753436a07f9ba1289c6ffa01cd93db3548776088aa917cc43b63f68fa60f", size = 443591, upload_time = "2025-04-08T13:27:03.789Z" }, ] [[package]] name = "pydantic-core" -version = "2.27.2" +version = "2.33.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443, upload_time = "2024-12-18T11:31:54.917Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/89/f3450af9d09d44eea1f2c369f49e8f181d742f28220f88cc4dfaae91ea6e/pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc", size = 1893421, upload_time = "2024-12-18T11:27:55.409Z" }, - { url = "https://files.pythonhosted.org/packages/9e/e3/71fe85af2021f3f386da42d291412e5baf6ce7716bd7101ea49c810eda90/pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7", size = 1814998, upload_time = "2024-12-18T11:27:57.252Z" }, - { url = "https://files.pythonhosted.org/packages/a6/3c/724039e0d848fd69dbf5806894e26479577316c6f0f112bacaf67aa889ac/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15", size = 1826167, upload_time = "2024-12-18T11:27:59.146Z" }, - { url = "https://files.pythonhosted.org/packages/2b/5b/1b29e8c1fb5f3199a9a57c1452004ff39f494bbe9bdbe9a81e18172e40d3/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306", size = 1865071, upload_time = "2024-12-18T11:28:02.625Z" }, - { url = "https://files.pythonhosted.org/packages/89/6c/3985203863d76bb7d7266e36970d7e3b6385148c18a68cc8915fd8c84d57/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99", size = 2036244, upload_time = "2024-12-18T11:28:04.442Z" }, - { url = "https://files.pythonhosted.org/packages/0e/41/f15316858a246b5d723f7d7f599f79e37493b2e84bfc789e58d88c209f8a/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459", size = 2737470, upload_time = "2024-12-18T11:28:07.679Z" }, - { url = "https://files.pythonhosted.org/packages/a8/7c/b860618c25678bbd6d1d99dbdfdf0510ccb50790099b963ff78a124b754f/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048", size = 1992291, upload_time = "2024-12-18T11:28:10.297Z" }, - { url = "https://files.pythonhosted.org/packages/bf/73/42c3742a391eccbeab39f15213ecda3104ae8682ba3c0c28069fbcb8c10d/pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d", size = 
1994613, upload_time = "2024-12-18T11:28:13.362Z" }, - { url = "https://files.pythonhosted.org/packages/94/7a/941e89096d1175d56f59340f3a8ebaf20762fef222c298ea96d36a6328c5/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b", size = 2002355, upload_time = "2024-12-18T11:28:16.587Z" }, - { url = "https://files.pythonhosted.org/packages/6e/95/2359937a73d49e336a5a19848713555605d4d8d6940c3ec6c6c0ca4dcf25/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474", size = 2126661, upload_time = "2024-12-18T11:28:18.407Z" }, - { url = "https://files.pythonhosted.org/packages/2b/4c/ca02b7bdb6012a1adef21a50625b14f43ed4d11f1fc237f9d7490aa5078c/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6", size = 2153261, upload_time = "2024-12-18T11:28:21.471Z" }, - { url = "https://files.pythonhosted.org/packages/72/9d/a241db83f973049a1092a079272ffe2e3e82e98561ef6214ab53fe53b1c7/pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c", size = 1812361, upload_time = "2024-12-18T11:28:23.53Z" }, - { url = "https://files.pythonhosted.org/packages/e8/ef/013f07248041b74abd48a385e2110aa3a9bbfef0fbd97d4e6d07d2f5b89a/pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc", size = 1982484, upload_time = "2024-12-18T11:28:25.391Z" }, - { url = "https://files.pythonhosted.org/packages/10/1c/16b3a3e3398fd29dca77cea0a1d998d6bde3902fa2706985191e2313cc76/pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4", size = 1867102, upload_time = "2024-12-18T11:28:28.593Z" }, - { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127, upload_time = "2024-12-18T11:28:30.346Z" }, - { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340, upload_time = "2024-12-18T11:28:32.521Z" }, - { url = "https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900, upload_time = "2024-12-18T11:28:34.507Z" }, - { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177, upload_time = "2024-12-18T11:28:36.488Z" }, - { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 
2038046, upload_time = "2024-12-18T11:28:39.409Z" }, - { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386, upload_time = "2024-12-18T11:28:41.221Z" }, - { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060, upload_time = "2024-12-18T11:28:44.709Z" }, - { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870, upload_time = "2024-12-18T11:28:46.839Z" }, - { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822, upload_time = "2024-12-18T11:28:48.896Z" }, - { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364, upload_time = "2024-12-18T11:28:50.755Z" }, - { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303, upload_time = "2024-12-18T11:28:54.122Z" }, - { url = "https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064, upload_time = "2024-12-18T11:28:56.074Z" }, - { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046, upload_time = "2024-12-18T11:28:58.107Z" }, - { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092, upload_time = "2024-12-18T11:29:01.335Z" }, - { url = "https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709, upload_time = "2024-12-18T11:29:03.193Z" }, - { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273, 
upload_time = "2024-12-18T11:29:05.306Z" }, - { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027, upload_time = "2024-12-18T11:29:07.294Z" }, - { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888, upload_time = "2024-12-18T11:29:09.249Z" }, - { url = "https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738, upload_time = "2024-12-18T11:29:11.23Z" }, - { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138, upload_time = "2024-12-18T11:29:16.396Z" }, - { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025, upload_time = "2024-12-18T11:29:20.25Z" }, - { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633, upload_time = "2024-12-18T11:29:23.877Z" }, - { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404, upload_time = "2024-12-18T11:29:25.872Z" }, - { url = "https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130, upload_time = "2024-12-18T11:29:29.252Z" }, - { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946, upload_time = "2024-12-18T11:29:31.338Z" }, - { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387, upload_time = "2024-12-18T11:29:33.481Z" }, - { url = "https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = 
"sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453, upload_time = "2024-12-18T11:29:35.533Z" }, - { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186, upload_time = "2024-12-18T11:29:37.649Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/17/19/ed6a078a5287aea7922de6841ef4c06157931622c89c2a47940837b5eecd/pydantic_core-2.33.1.tar.gz", hash = "sha256:bcc9c6fdb0ced789245b02b7d6603e17d1563064ddcfc36f046b61c0c05dd9df", size = 434395, upload_time = "2025-04-02T09:49:41.8Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d6/7f/c6298830cb780c46b4f46bb24298d01019ffa4d21769f39b908cd14bbd50/pydantic_core-2.33.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e966fc3caaf9f1d96b349b0341c70c8d6573bf1bac7261f7b0ba88f96c56c24", size = 2044224, upload_time = "2025-04-02T09:47:04.199Z" }, + { url = "https://files.pythonhosted.org/packages/a8/65/6ab3a536776cad5343f625245bd38165d6663256ad43f3a200e5936afd6c/pydantic_core-2.33.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bfd0adeee563d59c598ceabddf2c92eec77abcb3f4a391b19aa7366170bd9e30", size = 1858845, upload_time = "2025-04-02T09:47:05.686Z" }, + { url = "https://files.pythonhosted.org/packages/e9/15/9a22fd26ba5ee8c669d4b8c9c244238e940cd5d818649603ca81d1c69861/pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91815221101ad3c6b507804178a7bb5cb7b2ead9ecd600041669c8d805ebd595", size = 1910029, upload_time = "2025-04-02T09:47:07.042Z" }, + { url = "https://files.pythonhosted.org/packages/d5/33/8cb1a62818974045086f55f604044bf35b9342900318f9a2a029a1bec460/pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9fea9c1869bb4742d174a57b4700c6dadea951df8b06de40c2fedb4f02931c2e", size = 1997784, upload_time = "2025-04-02T09:47:08.63Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ca/49958e4df7715c71773e1ea5be1c74544923d10319173264e6db122543f9/pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d20eb4861329bb2484c021b9d9a977566ab16d84000a57e28061151c62b349a", size = 2141075, upload_time = "2025-04-02T09:47:10.267Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a6/0b3a167a9773c79ba834b959b4e18c3ae9216b8319bd8422792abc8a41b1/pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb935c5591573ae3201640579f30128ccc10739b45663f93c06796854405505", size = 2745849, upload_time = "2025-04-02T09:47:11.724Z" }, + { url = "https://files.pythonhosted.org/packages/0b/60/516484135173aa9e5861d7a0663dce82e4746d2e7f803627d8c25dfa5578/pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c964fd24e6166420d18fb53996d8c9fd6eac9bf5ae3ec3d03015be4414ce497f", size = 2005794, upload_time = "2025-04-02T09:47:13.099Z" }, + { url = "https://files.pythonhosted.org/packages/86/70/05b1eb77459ad47de00cf78ee003016da0cedf8b9170260488d7c21e9181/pydantic_core-2.33.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:681d65e9011f7392db5aa002b7423cc442d6a673c635668c227c6c8d0e5a4f77", size = 2123237, upload_time = "2025-04-02T09:47:14.355Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/57/12667a1409c04ae7dc95d3b43158948eb0368e9c790be8b095cb60611459/pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e100c52f7355a48413e2999bfb4e139d2977a904495441b374f3d4fb4a170961", size = 2086351, upload_time = "2025-04-02T09:47:15.676Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/cc6d1d1c1664b58fdd6ecc64c84366c34ec9b606aeb66cafab6f4088974c/pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:048831bd363490be79acdd3232f74a0e9951b11b2b4cc058aeb72b22fdc3abe1", size = 2258914, upload_time = "2025-04-02T09:47:17Z" }, + { url = "https://files.pythonhosted.org/packages/d1/0a/edb137176a1f5419b2ddee8bde6a0a548cfa3c74f657f63e56232df8de88/pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bdc84017d28459c00db6f918a7272a5190bec3090058334e43a76afb279eac7c", size = 2257385, upload_time = "2025-04-02T09:47:18.631Z" }, + { url = "https://files.pythonhosted.org/packages/26/3c/48ca982d50e4b0e1d9954919c887bdc1c2b462801bf408613ccc641b3daa/pydantic_core-2.33.1-cp311-cp311-win32.whl", hash = "sha256:32cd11c5914d1179df70406427097c7dcde19fddf1418c787540f4b730289896", size = 1923765, upload_time = "2025-04-02T09:47:20.34Z" }, + { url = "https://files.pythonhosted.org/packages/33/cd/7ab70b99e5e21559f5de38a0928ea84e6f23fdef2b0d16a6feaf942b003c/pydantic_core-2.33.1-cp311-cp311-win_amd64.whl", hash = "sha256:2ea62419ba8c397e7da28a9170a16219d310d2cf4970dbc65c32faf20d828c83", size = 1950688, upload_time = "2025-04-02T09:47:22.029Z" }, + { url = "https://files.pythonhosted.org/packages/4b/ae/db1fc237b82e2cacd379f63e3335748ab88b5adde98bf7544a1b1bd10a84/pydantic_core-2.33.1-cp311-cp311-win_arm64.whl", hash = "sha256:fc903512177361e868bc1f5b80ac8c8a6e05fcdd574a5fb5ffeac5a9982b9e89", size = 1908185, upload_time = "2025-04-02T09:47:23.385Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ce/3cb22b07c29938f97ff5f5bb27521f95e2ebec399b882392deb68d6c440e/pydantic_core-2.33.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1293d7febb995e9d3ec3ea09caf1a26214eec45b0f29f6074abb004723fc1de8", size = 2026640, upload_time = "2025-04-02T09:47:25.394Z" }, + { url = "https://files.pythonhosted.org/packages/19/78/f381d643b12378fee782a72126ec5d793081ef03791c28a0fd542a5bee64/pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:99b56acd433386c8f20be5c4000786d1e7ca0523c8eefc995d14d79c7a081498", size = 1852649, upload_time = "2025-04-02T09:47:27.417Z" }, + { url = "https://files.pythonhosted.org/packages/9d/2b/98a37b80b15aac9eb2c6cfc6dbd35e5058a352891c5cce3a8472d77665a6/pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35a5ec3fa8c2fe6c53e1b2ccc2454398f95d5393ab398478f53e1afbbeb4d939", size = 1892472, upload_time = "2025-04-02T09:47:29.006Z" }, + { url = "https://files.pythonhosted.org/packages/4e/d4/3c59514e0f55a161004792b9ff3039da52448f43f5834f905abef9db6e4a/pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b172f7b9d2f3abc0efd12e3386f7e48b576ef309544ac3a63e5e9cdd2e24585d", size = 1977509, upload_time = "2025-04-02T09:47:33.464Z" }, + { url = "https://files.pythonhosted.org/packages/a9/b6/c2c7946ef70576f79a25db59a576bce088bdc5952d1b93c9789b091df716/pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9097b9f17f91eea659b9ec58148c0747ec354a42f7389b9d50701610d86f812e", size = 2128702, upload_time = "2025-04-02T09:47:34.812Z" }, + { url = 
"https://files.pythonhosted.org/packages/88/fe/65a880f81e3f2a974312b61f82a03d85528f89a010ce21ad92f109d94deb/pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc77ec5b7e2118b152b0d886c7514a4653bcb58c6b1d760134a9fab915f777b3", size = 2679428, upload_time = "2025-04-02T09:47:37.315Z" }, + { url = "https://files.pythonhosted.org/packages/6f/ff/4459e4146afd0462fb483bb98aa2436d69c484737feaceba1341615fb0ac/pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3d15245b08fa4a84cefc6c9222e6f37c98111c8679fbd94aa145f9a0ae23d", size = 2008753, upload_time = "2025-04-02T09:47:39.013Z" }, + { url = "https://files.pythonhosted.org/packages/7c/76/1c42e384e8d78452ededac8b583fe2550c84abfef83a0552e0e7478ccbc3/pydantic_core-2.33.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef99779001d7ac2e2461d8ab55d3373fe7315caefdbecd8ced75304ae5a6fc6b", size = 2114849, upload_time = "2025-04-02T09:47:40.427Z" }, + { url = "https://files.pythonhosted.org/packages/00/72/7d0cf05095c15f7ffe0eb78914b166d591c0eed72f294da68378da205101/pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fc6bf8869e193855e8d91d91f6bf59699a5cdfaa47a404e278e776dd7f168b39", size = 2069541, upload_time = "2025-04-02T09:47:42.01Z" }, + { url = "https://files.pythonhosted.org/packages/b3/69/94a514066bb7d8be499aa764926937409d2389c09be0b5107a970286ef81/pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:b1caa0bc2741b043db7823843e1bde8aaa58a55a58fda06083b0569f8b45693a", size = 2239225, upload_time = "2025-04-02T09:47:43.425Z" }, + { url = "https://files.pythonhosted.org/packages/84/b0/e390071eadb44b41f4f54c3cef64d8bf5f9612c92686c9299eaa09e267e2/pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ec259f62538e8bf364903a7d0d0239447059f9434b284f5536e8402b7dd198db", size = 2248373, upload_time = "2025-04-02T09:47:44.979Z" }, + { url = "https://files.pythonhosted.org/packages/d6/b2/288b3579ffc07e92af66e2f1a11be3b056fe1214aab314748461f21a31c3/pydantic_core-2.33.1-cp312-cp312-win32.whl", hash = "sha256:e14f369c98a7c15772b9da98987f58e2b509a93235582838bd0d1d8c08b68fda", size = 1907034, upload_time = "2025-04-02T09:47:46.843Z" }, + { url = "https://files.pythonhosted.org/packages/02/28/58442ad1c22b5b6742b992ba9518420235adced665513868f99a1c2638a5/pydantic_core-2.33.1-cp312-cp312-win_amd64.whl", hash = "sha256:1c607801d85e2e123357b3893f82c97a42856192997b95b4d8325deb1cd0c5f4", size = 1956848, upload_time = "2025-04-02T09:47:48.404Z" }, + { url = "https://files.pythonhosted.org/packages/a1/eb/f54809b51c7e2a1d9f439f158b8dd94359321abcc98767e16fc48ae5a77e/pydantic_core-2.33.1-cp312-cp312-win_arm64.whl", hash = "sha256:8d13f0276806ee722e70a1c93da19748594f19ac4299c7e41237fc791d1861ea", size = 1903986, upload_time = "2025-04-02T09:47:49.839Z" }, + { url = "https://files.pythonhosted.org/packages/7a/24/eed3466a4308d79155f1cdd5c7432c80ddcc4530ba8623b79d5ced021641/pydantic_core-2.33.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:70af6a21237b53d1fe7b9325b20e65cbf2f0a848cf77bed492b029139701e66a", size = 2033551, upload_time = "2025-04-02T09:47:51.648Z" }, + { url = "https://files.pythonhosted.org/packages/ab/14/df54b1a0bc9b6ded9b758b73139d2c11b4e8eb43e8ab9c5847c0a2913ada/pydantic_core-2.33.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:282b3fe1bbbe5ae35224a0dbd05aed9ccabccd241e8e6b60370484234b456266", size = 1852785, upload_time = "2025-04-02T09:47:53.149Z" }, + { url = 
"https://files.pythonhosted.org/packages/fa/96/e275f15ff3d34bb04b0125d9bc8848bf69f25d784d92a63676112451bfb9/pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b315e596282bbb5822d0c7ee9d255595bd7506d1cb20c2911a4da0b970187d3", size = 1897758, upload_time = "2025-04-02T09:47:55.006Z" }, + { url = "https://files.pythonhosted.org/packages/b7/d8/96bc536e975b69e3a924b507d2a19aedbf50b24e08c80fb00e35f9baaed8/pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dfae24cf9921875ca0ca6a8ecb4bb2f13c855794ed0d468d6abbec6e6dcd44a", size = 1986109, upload_time = "2025-04-02T09:47:56.532Z" }, + { url = "https://files.pythonhosted.org/packages/90/72/ab58e43ce7e900b88cb571ed057b2fcd0e95b708a2e0bed475b10130393e/pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6dd8ecfde08d8bfadaea669e83c63939af76f4cf5538a72597016edfa3fad516", size = 2129159, upload_time = "2025-04-02T09:47:58.088Z" }, + { url = "https://files.pythonhosted.org/packages/dc/3f/52d85781406886c6870ac995ec0ba7ccc028b530b0798c9080531b409fdb/pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f593494876eae852dc98c43c6f260f45abdbfeec9e4324e31a481d948214764", size = 2680222, upload_time = "2025-04-02T09:47:59.591Z" }, + { url = "https://files.pythonhosted.org/packages/f4/56/6e2ef42f363a0eec0fd92f74a91e0ac48cd2e49b695aac1509ad81eee86a/pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:948b73114f47fd7016088e5186d13faf5e1b2fe83f5e320e371f035557fd264d", size = 2006980, upload_time = "2025-04-02T09:48:01.397Z" }, + { url = "https://files.pythonhosted.org/packages/4c/c0/604536c4379cc78359f9ee0aa319f4aedf6b652ec2854953f5a14fc38c5a/pydantic_core-2.33.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e11f3864eb516af21b01e25fac915a82e9ddad3bb0fb9e95a246067398b435a4", size = 2120840, upload_time = "2025-04-02T09:48:03.056Z" }, + { url = "https://files.pythonhosted.org/packages/1f/46/9eb764814f508f0edfb291a0f75d10854d78113fa13900ce13729aaec3ae/pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:549150be302428b56fdad0c23c2741dcdb5572413776826c965619a25d9c6bde", size = 2072518, upload_time = "2025-04-02T09:48:04.662Z" }, + { url = "https://files.pythonhosted.org/packages/42/e3/fb6b2a732b82d1666fa6bf53e3627867ea3131c5f39f98ce92141e3e3dc1/pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:495bc156026efafd9ef2d82372bd38afce78ddd82bf28ef5276c469e57c0c83e", size = 2248025, upload_time = "2025-04-02T09:48:06.226Z" }, + { url = "https://files.pythonhosted.org/packages/5c/9d/fbe8fe9d1aa4dac88723f10a921bc7418bd3378a567cb5e21193a3c48b43/pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ec79de2a8680b1a67a07490bddf9636d5c2fab609ba8c57597e855fa5fa4dacd", size = 2254991, upload_time = "2025-04-02T09:48:08.114Z" }, + { url = "https://files.pythonhosted.org/packages/aa/99/07e2237b8a66438d9b26482332cda99a9acccb58d284af7bc7c946a42fd3/pydantic_core-2.33.1-cp313-cp313-win32.whl", hash = "sha256:ee12a7be1742f81b8a65b36c6921022301d466b82d80315d215c4c691724986f", size = 1915262, upload_time = "2025-04-02T09:48:09.708Z" }, + { url = "https://files.pythonhosted.org/packages/8a/f4/e457a7849beeed1e5defbcf5051c6f7b3c91a0624dd31543a64fc9adcf52/pydantic_core-2.33.1-cp313-cp313-win_amd64.whl", hash = "sha256:ede9b407e39949d2afc46385ce6bd6e11588660c26f80576c11c958e6647bc40", 
size = 1956626, upload_time = "2025-04-02T09:48:11.288Z" }, + { url = "https://files.pythonhosted.org/packages/20/d0/e8d567a7cff7b04e017ae164d98011f1e1894269fe8e90ea187a3cbfb562/pydantic_core-2.33.1-cp313-cp313-win_arm64.whl", hash = "sha256:aa687a23d4b7871a00e03ca96a09cad0f28f443690d300500603bd0adba4b523", size = 1909590, upload_time = "2025-04-02T09:48:12.861Z" }, + { url = "https://files.pythonhosted.org/packages/ef/fd/24ea4302d7a527d672c5be06e17df16aabfb4e9fdc6e0b345c21580f3d2a/pydantic_core-2.33.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:401d7b76e1000d0dd5538e6381d28febdcacb097c8d340dde7d7fc6e13e9f95d", size = 1812963, upload_time = "2025-04-02T09:48:14.553Z" }, + { url = "https://files.pythonhosted.org/packages/5f/95/4fbc2ecdeb5c1c53f1175a32d870250194eb2fdf6291b795ab08c8646d5d/pydantic_core-2.33.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7aeb055a42d734c0255c9e489ac67e75397d59c6fbe60d155851e9782f276a9c", size = 1986896, upload_time = "2025-04-02T09:48:16.222Z" }, + { url = "https://files.pythonhosted.org/packages/71/ae/fe31e7f4a62431222d8f65a3bd02e3fa7e6026d154a00818e6d30520ea77/pydantic_core-2.33.1-cp313-cp313t-win_amd64.whl", hash = "sha256:338ea9b73e6e109f15ab439e62cb3b78aa752c7fd9536794112e14bee02c8d18", size = 1931810, upload_time = "2025-04-02T09:48:17.97Z" }, + { url = "https://files.pythonhosted.org/packages/0b/76/1794e440c1801ed35415238d2c728f26cd12695df9057154ad768b7b991c/pydantic_core-2.33.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3a371dc00282c4b84246509a5ddc808e61b9864aa1eae9ecc92bb1268b82db4a", size = 2042858, upload_time = "2025-04-02T09:49:03.419Z" }, + { url = "https://files.pythonhosted.org/packages/73/b4/9cd7b081fb0b1b4f8150507cd59d27b275c3e22ad60b35cb19ea0977d9b9/pydantic_core-2.33.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f59295ecc75a1788af8ba92f2e8c6eeaa5a94c22fc4d151e8d9638814f85c8fc", size = 1873745, upload_time = "2025-04-02T09:49:05.391Z" }, + { url = "https://files.pythonhosted.org/packages/e1/d7/9ddb7575d4321e40d0363903c2576c8c0c3280ebea137777e5ab58d723e3/pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08530b8ac922003033f399128505f513e30ca770527cc8bbacf75a84fcc2c74b", size = 1904188, upload_time = "2025-04-02T09:49:07.352Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a8/3194ccfe461bb08da19377ebec8cb4f13c9bd82e13baebc53c5c7c39a029/pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bae370459da6a5466978c0eacf90690cb57ec9d533f8e63e564ef3822bfa04fe", size = 2083479, upload_time = "2025-04-02T09:49:09.304Z" }, + { url = "https://files.pythonhosted.org/packages/42/c7/84cb569555d7179ca0b3f838cef08f66f7089b54432f5b8599aac6e9533e/pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e3de2777e3b9f4d603112f78006f4ae0acb936e95f06da6cb1a45fbad6bdb4b5", size = 2118415, upload_time = "2025-04-02T09:49:11.25Z" }, + { url = "https://files.pythonhosted.org/packages/3b/67/72abb8c73e0837716afbb58a59cc9e3ae43d1aa8677f3b4bc72c16142716/pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a64e81e8cba118e108d7126362ea30e021291b7805d47e4896e52c791be2761", size = 2079623, upload_time = "2025-04-02T09:49:13.292Z" }, + { url = "https://files.pythonhosted.org/packages/0b/cd/c59707e35a47ba4cbbf153c3f7c56420c58653b5801b055dc52cccc8e2dc/pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = 
"sha256:52928d8c1b6bda03cc6d811e8923dffc87a2d3c8b3bfd2ce16471c7147a24850", size = 2250175, upload_time = "2025-04-02T09:49:15.597Z" }, + { url = "https://files.pythonhosted.org/packages/84/32/e4325a6676b0bed32d5b084566ec86ed7fd1e9bcbfc49c578b1755bde920/pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1b30d92c9412beb5ac6b10a3eb7ef92ccb14e3f2a8d7732e2d739f58b3aa7544", size = 2254674, upload_time = "2025-04-02T09:49:17.61Z" }, + { url = "https://files.pythonhosted.org/packages/12/6f/5596dc418f2e292ffc661d21931ab34591952e2843e7168ea5a52591f6ff/pydantic_core-2.33.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f995719707e0e29f0f41a8aa3bcea6e761a36c9136104d3189eafb83f5cec5e5", size = 2080951, upload_time = "2025-04-02T09:49:19.559Z" }, ] [[package]] @@ -2146,16 +2208,16 @@ wheels = [ [[package]] name = "rich-toolkit" -version = "0.14.5" +version = "0.14.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, { name = "rich" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d1/24/f0678256fbe0643b4ba00a460f4b736ef07042e459f8d4087c8b7011ab81/rich_toolkit-0.14.5.tar.gz", hash = "sha256:1cb7a3fa0bdbf35793460708664f3f797e8b18cedec9cd41a7e6125e4bc6272b", size = 104799, upload_time = "2025-05-05T10:19:24.521Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/31/b6d055f291a660a7bcaec4bcc9457b9fef8ecb6293e527b1eef1840aefd4/rich_toolkit-0.14.6.tar.gz", hash = "sha256:9dbd40e83414b84e828bf899115fff8877ce5951b73175f44db142902f07645d", size = 110805, upload_time = "2025-05-12T19:19:15.284Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/13/621cc551b72de51e6e5cb7cfc510a141e1858bd380ee3c8108fbda4a6be0/rich_toolkit-0.14.5-py3-none-any.whl", hash = "sha256:2fe9846ecbf5d0cdf236c7f43452b68d9da1436a81594aba6b79b3c48b05703b", size = 24791, upload_time = "2025-05-05T10:19:23.346Z" }, + { url = "https://files.pythonhosted.org/packages/2e/3c/7a824c0514e87c61000583ac22c8321da6dc8e58a93d5f56e583482a2ee0/rich_toolkit-0.14.6-py3-none-any.whl", hash = "sha256:764f3a5f9e4b539ce805596863299e8982599514906dc5e3ccc2d390ef74c301", size = 24815, upload_time = "2025-05-12T19:19:13.713Z" }, ] [[package]] @@ -2194,11 +2256,11 @@ wheels = [ [[package]] name = "setuptools" -version = "80.3.1" +version = "80.4.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/70/dc/3976b322de9d2e87ed0007cf04cc7553969b6c7b3f48a565d0333748fbcd/setuptools-80.3.1.tar.gz", hash = "sha256:31e2c58dbb67c99c289f51c16d899afedae292b978f8051efaf6262d8212f927", size = 1315082, upload_time = "2025-05-04T18:47:04.397Z" } +sdist = { url = "https://files.pythonhosted.org/packages/95/32/0cc40fe41fd2adb80a2f388987f4f8db3c866c69e33e0b4c8b093fdf700e/setuptools-80.4.0.tar.gz", hash = "sha256:5a78f61820bc088c8e4add52932ae6b8cf423da2aff268c23f813cfbb13b4006", size = 1315008, upload_time = "2025-05-09T20:42:27.972Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/7e/5d8af3317ddbf9519b687bd1c39d8737fde07d97f54df65553faca5cffb1/setuptools-80.3.1-py3-none-any.whl", hash = "sha256:ea8e00d7992054c4c592aeb892f6ad51fe1b4d90cc6947cc45c45717c40ec537", size = 1201172, upload_time = "2025-05-04T18:47:02.575Z" }, + { url = "https://files.pythonhosted.org/packages/b1/93/dba5ed08c2e31ec7cdc2ce75705a484ef0be1a2fecac8a58272489349de8/setuptools-80.4.0-py3-none-any.whl", hash = "sha256:6cdc8cb9a7d590b237dbe4493614a9b75d0559b888047c1f67d49ba50fc3edb2", size = 1200812, 
upload_time = "2025-05-09T20:42:25.325Z" }, ] [[package]] @@ -2529,6 +2591,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload_time = "2025-04-10T14:19:03.967Z" }, ] +[[package]] +name = "typing-inspection" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/5c/e6082df02e215b846b4b8c0b887a64d7d08ffaba30605502639d44c06b82/typing_inspection-0.4.0.tar.gz", hash = "sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122", size = 76222, upload_time = "2025-02-25T17:27:59.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/08/aa4fdfb71f7de5176385bd9e90852eaf6b5d622735020ad600f2bab54385/typing_inspection-0.4.0-py3-none-any.whl", hash = "sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f", size = 14125, upload_time = "2025-02-25T17:27:57.754Z" }, +] + [[package]] name = "tzdata" version = "2025.2" From b4a445f7628a056855595c50f2753aeecb857f24 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 13 May 2025 11:01:31 -0400 Subject: [PATCH 182/182] feat(app): Change metrics endpoints from per week to per month --- collector_db/AsyncDatabaseClient.py | 44 ++++++++++--------- core/DTOs/GetMetricsBacklogResponse.py | 17 +++++-- ...tMetricsURLsBreakdownPendingResponseDTO.py | 14 +++++- ...etricsURLsBreakdownSubmittedResponseDTO.py | 17 +++++-- .../integration/api/test_metrics.py | 28 ++++++------ 5 files changed, 77 insertions(+), 43 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 03c652c9..ac6216d6 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -2001,14 +2001,14 @@ async def get_urls_breakdown_submitted_metrics( ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: # Build the query - week = func.date_trunc('week', URLDataSource.created_at) + month = func.date_trunc('month', URLDataSource.created_at) query = ( select( - week.label('week'), + month.label('month'), func.count(URLDataSource.id).label('count_submitted'), ) - .group_by(week) - .order_by(week.asc()) + .group_by(month) + .order_by(month.asc()) ) # Execute the query @@ -2017,7 +2017,7 @@ async def get_urls_breakdown_submitted_metrics( final_results: list[GetMetricsURLsBreakdownSubmittedInnerDTO] = [] for result in results: dto = GetMetricsURLsBreakdownSubmittedInnerDTO( - week_of=result.week, + month=result.month.strftime("%B %Y"), count_submitted=result.count_submitted ) final_results.append(dto) @@ -2111,12 +2111,12 @@ async def get_urls_breakdown_pending_metrics( ).cte("flags") - week = func.date_trunc('week', URL.created_at) + month = func.date_trunc('month', URL.created_at) # Build the query query = ( select( - week.label('week'), + month.label('month'), func.count(URL.id).label('count_total'), func.count(case( (flags.c.has_user_record_type_annotation == True, 1)) @@ -2130,8 +2130,8 @@ async def get_urls_breakdown_pending_metrics( ) .outerjoin(flags, flags.c.url_id == URL.id) .where(URL.outcome == URLStatus.PENDING.value) - .group_by(week) - .order_by(week.asc()) + .group_by(month) + .order_by(month.asc()) ) # Execute the query and return the results @@ -2141,7 +2141,7 @@ async def get_urls_breakdown_pending_metrics( for result 
in all_results: dto = GetMetricsURLsBreakdownPendingResponseInnerDTO( - week_created_at=result.week, + month=result.month.strftime("%B %Y"), count_pending_total=result.count_total, count_pending_relevant_user=result.user_relevant_count, count_pending_record_type_user=result.user_record_type_count, @@ -2157,16 +2157,18 @@ async def get_backlog_metrics( self, session: AsyncSession ) -> GetMetricsBacklogResponseDTO: - # 1. Create a subquery that assigns row_number() partitioned by week - weekly_snapshots_subq = ( + month = func.date_trunc('month', BacklogSnapshot.created_at) + + # 1. Create a subquery that assigns row_number() partitioned by month + monthly_snapshot_subq = ( select( BacklogSnapshot.id, BacklogSnapshot.created_at, BacklogSnapshot.count_pending_total, - func.date_trunc('week', BacklogSnapshot.created_at).label("week_start"), + month.label("month_start"), func.row_number() .over( - partition_by=func.date_trunc('week', BacklogSnapshot.created_at), + partition_by=month, order_by=BacklogSnapshot.created_at.desc() ) .label("row_number") @@ -2174,15 +2176,15 @@ async def get_backlog_metrics( .subquery() ) - # 2. Filter for the top (most recent) row in each week + # 2. Filter for the top (most recent) row in each month stmt = ( select( - weekly_snapshots_subq.c.week_start, - weekly_snapshots_subq.c.created_at, - weekly_snapshots_subq.c.count_pending_total + monthly_snapshot_subq.c.month_start, + monthly_snapshot_subq.c.created_at, + monthly_snapshot_subq.c.count_pending_total ) - .where(weekly_snapshots_subq.c.row_number == 1) - .order_by(weekly_snapshots_subq.c.week_start) + .where(monthly_snapshot_subq.c.row_number == 1) + .order_by(monthly_snapshot_subq.c.month_start) ) raw_result = await session.execute(stmt) @@ -2191,7 +2193,7 @@ async def get_backlog_metrics( for result in results: final_results.append( GetMetricsBacklogResponseInnerDTO( - week_of=result.week_start, + month=result.month_start.strftime("%B %Y"), count_pending_total=result.count_pending_total, ) ) diff --git a/core/DTOs/GetMetricsBacklogResponse.py b/core/DTOs/GetMetricsBacklogResponse.py index 0df38324..8193e385 100644 --- a/core/DTOs/GetMetricsBacklogResponse.py +++ b/core/DTOs/GetMetricsBacklogResponse.py @@ -1,10 +1,21 @@ -import datetime +from datetime import datetime + +from pydantic import BaseModel, field_validator -from pydantic import BaseModel class GetMetricsBacklogResponseInnerDTO(BaseModel): - week_of: datetime.date + month: str count_pending_total: int + @field_validator("month") + @classmethod + def validate_month_format(cls, v: str) -> str: + try: + # This will raise ValueError if format doesn't match + datetime.strptime(v, "%B %Y") + except ValueError: + raise ValueError("month must be in the format 'MonthName YYYY' (e.g., 'May 2025')") + return v + class GetMetricsBacklogResponseDTO(BaseModel): entries: list[GetMetricsBacklogResponseInnerDTO] \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py index 22235e45..16e596d5 100644 --- a/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py +++ b/core/DTOs/GetMetricsURLsBreakdownPendingResponseDTO.py @@ -1,12 +1,22 @@ -from pydantic import BaseModel +from pydantic import BaseModel, field_validator from datetime import datetime class GetMetricsURLsBreakdownPendingResponseInnerDTO(BaseModel): - week_created_at: datetime + month: str count_pending_total: int count_pending_relevant_user: int count_pending_record_type_user: int count_pending_agency_user: 
int + @field_validator("month") + @classmethod + def validate_month_format(cls, v: str) -> str: + try: + # This will raise ValueError if format doesn't match + datetime.strptime(v, "%B %Y") + except ValueError: + raise ValueError("month must be in the format 'MonthName YYYY' (e.g., 'May 2025')") + return v + class GetMetricsURLsBreakdownPendingResponseDTO(BaseModel): entries: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] \ No newline at end of file diff --git a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py index d5c1dde5..2ac4e768 100644 --- a/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py +++ b/core/DTOs/GetMetricsURLsBreakdownSubmittedResponseDTO.py @@ -1,10 +1,21 @@ -from datetime import date +from datetime import datetime + +from pydantic import BaseModel, field_validator -from pydantic import BaseModel class GetMetricsURLsBreakdownSubmittedInnerDTO(BaseModel): - week_of: date + month: str count_submitted: int + @field_validator("month") + @classmethod + def validate_month_format(cls, v: str) -> str: + try: + # This will raise ValueError if format doesn't match + datetime.strptime(v, "%B %Y") + except ValueError: + raise ValueError("month must be in the format 'MonthName YYYY' (e.g., 'May 2025')") + return v + class GetMetricsURLsBreakdownSubmittedResponseDTO(BaseModel): entries: list[GetMetricsURLsBreakdownSubmittedInnerDTO] \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_metrics.py b/tests/test_automated/integration/api/test_metrics.py index fc45ad0b..b8eb6ca6 100644 --- a/tests/test_automated/integration/api/test_metrics.py +++ b/tests/test_automated/integration/api/test_metrics.py @@ -76,8 +76,8 @@ async def test_get_batches_aggregated_metrics(api_test_helper): @pytest.mark.asyncio async def test_get_batches_breakdown_metrics(api_test_helper): - # Create a different batch for each week, with different URLs - today = pendulum.today() + # Create a different batch for each month, with different URLs + today = pendulum.parse('2021-01-01') ath = api_test_helper batch_1_params = TestBatchCreationParameters( @@ -169,7 +169,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): async def test_get_urls_breakdown_submitted_metrics(api_test_helper): # Create URLs with submitted status, broken down in different amounts by different weeks # And ensure the URLs are - today = pendulum.today() + today = pendulum.parse('2021-01-01') ath = api_test_helper batch_1_params = TestBatchCreationParameters( @@ -234,7 +234,7 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): # with a different number of kinds of annotations per URLs - today = pendulum.today() + today = pendulum.parse('2021-01-01') ath = api_test_helper agency_id = await ath.db_data_creator.agency() @@ -315,7 +315,7 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): @pytest.mark.asyncio async def test_get_urls_aggregate_metrics(api_test_helper): ath = api_test_helper - today = pendulum.today() + today = pendulum.parse('2021-01-01') batch_0_params = TestBatchCreationParameters( strategy=CollectorType.MANUAL, @@ -384,14 +384,14 @@ async def test_get_urls_aggregate_metrics(api_test_helper): @pytest.mark.asyncio async def test_get_backlog_metrics(api_test_helper): - today = pendulum.today() + today = pendulum.parse('2021-01-01') ath = api_test_helper adb_client = ath.adb_client() - # Populate the backlog table and test that backlog metrics returned on a weekly basis - # 
Ensure that multiple days in each week are added to the backlog table, with different values
+    # Populate the backlog table and test that backlog metrics are returned on a monthly basis
+    # Ensure that multiple days in each month are added to the backlog table, with different values
 
 
     batch_1_params = TestBatchCreationParameters(
@@ -413,11 +413,11 @@ async def test_get_backlog_metrics(api_test_helper):
     batch_1 = await ath.db_data_creator.batch_v2(batch_1_params)
 
     await adb_client.populate_backlog_snapshot(
-        dt=today.subtract(weeks=3).naive()
+        dt=today.subtract(months=3).naive()
     )
 
     await adb_client.populate_backlog_snapshot(
-        dt=today.subtract(weeks=2, days=3).naive()
+        dt=today.subtract(months=2, days=3).naive()
     )
 
     batch_2_params = TestBatchCreationParameters(
@@ -439,11 +439,11 @@ async def test_get_backlog_metrics(api_test_helper):
     batch_2 = await ath.db_data_creator.batch_v2(batch_2_params)
 
     await adb_client.populate_backlog_snapshot(
-        dt=today.subtract(weeks=2).naive()
+        dt=today.subtract(months=2).naive()
     )
 
     await adb_client.populate_backlog_snapshot(
-        dt=today.subtract(weeks=1, days=4).naive()
+        dt=today.subtract(months=1, days=4).naive()
     )
 
     batch_3_params = TestBatchCreationParameters(
@@ -465,14 +465,14 @@ async def test_get_backlog_metrics(api_test_helper):
     batch_3 = await ath.db_data_creator.batch_v2(batch_3_params)
 
     await adb_client.populate_backlog_snapshot(
-        dt=today.subtract(weeks=1).naive()
+        dt=today.subtract(months=1).naive()
     )
 
     dto = await ath.request_validator.get_backlog_metrics()
 
     assert len(dto.entries) == 3
 
-    # Test that the count closest to the beginning of the week is returned for each week
+    # Test that the most recent count in each month is returned
    assert dto.entries[0].count_pending_total == 1
    assert dto.entries[1].count_pending_total == 5
    assert dto.entries[2].count_pending_total == 12
\ No newline at end of file
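
---
Editor's note: for readers tracing the new monthly aggregation, here is a minimal, self-contained sketch of the query pattern get_backlog_metrics now uses: date_trunc('month', ...) buckets rows by calendar month, and a row_number() window ordered newest-first keeps only the latest snapshot in each month. The model definition below is an assumption for illustration only (the real BacklogSnapshot is defined elsewhere in the repository); only the query shape mirrors the patch.

from datetime import datetime

from sqlalchemy import TIMESTAMP, Integer, func, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class BacklogSnapshot(Base):
    __tablename__ = "backlog_snapshot"  # assumed table name, for illustration

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    created_at: Mapped[datetime] = mapped_column(TIMESTAMP)
    count_pending_total: Mapped[int] = mapped_column(Integer)


# date_trunc is PostgreSQL-specific; it maps every timestamp to the first
# instant of its calendar month.
month = func.date_trunc("month", BacklogSnapshot.created_at)

# Rank snapshots within each month, newest first.
subq = (
    select(
        BacklogSnapshot.created_at,
        BacklogSnapshot.count_pending_total,
        month.label("month_start"),
        func.row_number()
        .over(partition_by=month, order_by=BacklogSnapshot.created_at.desc())
        .label("row_number"),
    ).subquery()
)

# row_number == 1 keeps only the most recent snapshot per month; ordering by
# month_start yields one chronological entry per month.
stmt = (
    select(subq.c.month_start, subq.c.count_pending_total)
    .where(subq.c.row_number == 1)
    .order_by(subq.c.month_start)
)

Executing stmt requires a session bound to PostgreSQL, consistent with the AsyncSession usage in the patch; on a database without date_trunc, the bucketing expression would need a dialect-specific substitute.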
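Similarly, a hedged sketch of the 'MonthName YYYY' validation that the patch duplicates across the three DTO modules. MonthlyEntry is an invented name for this demo; the validator body matches the patch.

from datetime import datetime

from pydantic import BaseModel, ValidationError, field_validator


class MonthlyEntry(BaseModel):  # stand-in for the three response DTOs
    month: str
    count_pending_total: int

    @field_validator("month")
    @classmethod
    def validate_month_format(cls, v: str) -> str:
        try:
            # strptime raises ValueError unless v parses as e.g. "May 2025"
            datetime.strptime(v, "%B %Y")
        except ValueError:
            raise ValueError("month must be in the format 'MonthName YYYY' (e.g., 'May 2025')")
        return v


MonthlyEntry(month="May 2025", count_pending_total=3)      # accepted
try:
    MonthlyEntry(month="2025-05", count_pending_total=3)   # rejected: wrong format
except ValidationError as exc:
    print(exc)

Because the identical validator appears in all three DTO files, hoisting it into a shared mixin or an Annotated string type would remove the duplication; that refactor is a suggestion only, not part of the patch.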