From 124ca7d095f8dd6efec7c6d4660ea33619343da3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 19 Aug 2025 09:31:13 -0400 Subject: [PATCH 01/33] Add test environment variable for INTERNET_ARCHIVE_S3_KEYS --- ...65a1431_augment_auto_agency_suggestions.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py diff --git a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py new file mode 100644 index 00000000..801af52f --- /dev/null +++ b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py @@ -0,0 +1,100 @@ +"""Augment auto_agency_suggestions + +Revision ID: b741b65a1431 +Revises: 8a70ee509a74 +Create Date: 2025-08-19 08:03:12.106575 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import created_at_column, updated_at_column + +# revision identifiers, used by Alembic. 
+revision: str = 'b741b65a1431' +down_revision: Union[str, None] = '8a70ee509a74' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "automated_url_agency_suggestions" +NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "url_auto_agency_suggestions" + +OLD_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agencies" +NEW_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agency" + +AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.Enum( + "homepage_match", + "nlp_location_match", + "muckrock_match", + "ckan_match", + "unknown", + name="agency_auto_suggestion_method" +) + +def upgrade() -> None: + op.rename_table(OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) + op.rename_table(OLD_LINK_URLS_AGENCY_TABLE_NAME, NEW_LINK_URLS_AGENCY_TABLE_NAME) + _alter_auto_agency_suggestions_table() + +def _alter_auto_agency_suggestions_table(): + # Created At + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + created_at_column() + ) + # Updated At + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + updated_at_column() + ) + # Method + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + sa.Column('method', AGENCY_AUTO_SUGGESTION_METHOD_ENUM, default="unknown", nullable=False) + ) + # Confidence + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + sa.Column( + 'confidence', + sa.Float(), + default=0.0, + nullable=False + ) + ) + # Check constraint that confidence is between 0 and 1 + op.create_check_constraint( + "auto_url_agency_suggestions_check_confidence_between_0_and_1", + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + "confidence BETWEEN 0 AND 1" + ) + + +def _revert_auto_agency_suggestions_table(): + # Created At + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'created_at' + ) + # Updated At + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'updated_at' + ) + # Method + op.drop_column( + 
NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'method' + ) + # Confidence + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'confidence' + ) + +def downgrade() -> None: + op.rename_table(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) + op.rename_table(NEW_LINK_URLS_AGENCY_TABLE_NAME, OLD_LINK_URLS_AGENCY_TABLE_NAME) + _revert_auto_agency_suggestions_table() From aa1822f22cf754bab9024aa548f9e01129a63d2f Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 21 Aug 2025 08:54:24 -0400 Subject: [PATCH 02/33] Continue draft --- src/db/models/impl/url/suggestion/agency/auto.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/db/models/impl/url/suggestion/agency/auto.py b/src/db/models/impl/url/suggestion/agency/auto.py index 5ecfdf0a..6d6710c4 100644 --- a/src/db/models/impl/url/suggestion/agency/auto.py +++ b/src/db/models/impl/url/suggestion/agency/auto.py @@ -1,16 +1,19 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint +from sqlalchemy import Column, Boolean, UniqueConstraint, Float from sqlalchemy.orm import relationship from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.standard import StandardBase from src.db.models.templates_.with_id import WithIDBase -class AutomatedUrlAgencySuggestion(URLDependentMixin, WithIDBase): +class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): __tablename__ = "automated_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) is_unknown = Column(Boolean, nullable=True) + confidence = Column(Float, nullable=False) + agency = relationship("Agency", back_populates="automated_suggestions") url = relationship("URL", back_populates="automated_agency_suggestions") From e32c8ececbad8221115d423da1325c6be0d78e1d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 24 Aug 2025 21:17:06 -0400 Subject: [PATCH 03/33] Progress 
draft --- ...65a1431_augment_auto_agency_suggestions.py | 160 +++++++++- .../queries/get_annotation_batch_info.py | 4 +- .../get_next_url_for_user_annotation.py | 4 +- .../agency/get/queries/next_for_annotation.py | 4 +- src/api/endpoints/annotate/all/get/query.py | 4 +- src/api/endpoints/batch/duplicates/query.py | 2 +- src/api/endpoints/batch/urls/query.py | 2 +- src/api/endpoints/collector/manual/query.py | 4 +- .../endpoints/metrics/backlog}/__init__.py | 0 src/api/endpoints/metrics/backlog/query.py | 53 ++++ .../metrics/batches/aggregated/query.py | 117 ------- .../batches/aggregated/query}/__init__.py | 0 .../aggregated/query/all_urls}/__init__.py | 0 .../aggregated/query/all_urls/query.py | 28 ++ .../query/batch_status_}/__init__.py | 0 .../aggregated/query/batch_status_/query.py | 37 +++ .../query/batch_status_/response.py | 10 + .../metrics/batches/aggregated/query/core.py | 79 +++++ .../aggregated/query/models/__init__.py | 0 .../aggregated/query/models/strategy_count.py | 8 + .../aggregated/query/pending/__init__.py | 0 .../batches/aggregated/query/pending/query.py | 37 +++ .../aggregated/query/rejected/__init__.py | 0 .../aggregated/query/rejected/query.py | 39 +++ .../aggregated/query/requester_/__init__.py | 0 .../aggregated/query/requester_/convert.py | 11 + .../aggregated/query/requester_/requester.py | 75 +++++ .../aggregated/query/submitted_/__init__.py | 0 .../aggregated/query/submitted_/query.py | 45 +++ .../aggregated/query/url_error/__init__.py | 0 .../aggregated/query/url_error/query.py | 34 ++ .../aggregated/query/validated_/__init__.py | 0 .../aggregated/query/validated_/query.py | 38 +++ .../batches/breakdown/error/__init__.py | 0 .../metrics/batches/breakdown/error/cte_.py | 25 ++ .../breakdown/not_relevant/__init__.py | 0 .../batches/breakdown/not_relevant/cte_.py | 27 ++ .../batches/breakdown/pending/__init__.py | 0 .../metrics/batches/breakdown/pending/cte_.py | 26 ++ .../metrics/batches/breakdown/query.py | 55 ++-- 
.../batches/breakdown/submitted/__init__.py | 0 .../batches/breakdown/submitted/cte_.py | 23 ++ .../batches/breakdown/templates/__init__.py | 0 .../batches/breakdown/templates/cte_.py | 20 ++ .../batches/breakdown/total/__init__.py | 0 .../metrics/batches/breakdown/total/cte_.py | 15 + .../batches/breakdown/validated/__init__.py | 0 .../batches/breakdown/validated/cte_.py | 23 ++ src/api/endpoints/metrics/urls/__init__.py | 0 .../metrics/urls/aggregated/__init__.py | 0 .../metrics/urls/aggregated/query/__init__.py | 0 .../metrics/urls/aggregated/query/core.py | 48 +++ .../aggregated/query/subqueries/__init__.py | 0 .../urls/aggregated/query/subqueries/all.py | 9 + .../urls/aggregated/query/subqueries/error.py | 11 + .../aggregated/query/subqueries/pending.py | 19 ++ .../aggregated/query/subqueries/rejected.py | 18 ++ .../aggregated/query/subqueries/submitted.py | 14 + .../aggregated/query/subqueries/validated.py | 14 + .../metrics/urls/breakdown/__init__.py | 0 .../metrics/urls/breakdown/query/__init__.py | 0 .../metrics/urls/breakdown/query/core.py | 91 ++++++ .../endpoints/review/approve/query_/core.py | 161 ++++++---- src/api/endpoints/review/next/query.py | 25 +- src/api/endpoints/review/reject/query.py | 14 +- src/collectors/enums.py | 6 +- src/collectors/queries/insert/url.py | 2 +- src/core/core.py | 2 +- src/core/enums.py | 11 + src/core/exceptions.py | 1 + .../scheduled/impl/huggingface/operator.py | 31 +- .../huggingface/queries/check/requester.py | 14 +- .../impl/huggingface/queries/get/convert.py | 18 +- .../impl/huggingface/queries/get/core.py | 35 ++- .../impl/huggingface/queries/get/mappings.py | 6 - .../data_sources/queries/upsert/convert.py | 24 ++ .../sync/data_sources/queries/upsert/core.py | 55 ++-- .../queries/upsert/helpers/convert.py | 6 +- .../queries/upsert/param_manager.py | 25 ++ .../data_sources/queries/upsert/requester.py | 6 +- ...pending_urls_without_agency_suggestions.py | 4 +- .../has_urls_without_agency_suggestions.py | 2 +- 
.../auto_relevant/queries/get_tdos.py | 2 +- src/db/client/async_.py | 294 ++++-------------- src/db/client/sync.py | 20 +- src/db/helpers/session/session_helper.py | 10 +- src/db/models/impl/batch/pydantic/__init__.py | 0 .../batch/{pydantic.py => pydantic/info.py} | 0 src/db/models/impl/batch/pydantic/insert.py | 17 + .../impl/flag/url_validated/__init__.py | 0 .../models/impl/flag/url_validated/enums.py | 8 + .../impl/flag/url_validated/pydantic.py | 22 ++ .../impl/flag/url_validated/sqlalchemy.py | 25 ++ src/db/models/impl/link/batch_url/__init__.py | 0 src/db/models/impl/link/batch_url/pydantic.py | 11 + .../{batch_url.py => batch_url/sqlalchemy.py} | 0 .../models/impl/link/url_agency/sqlalchemy.py | 2 +- src/db/models/impl/url/core/pydantic/info.py | 2 +- .../models/impl/url/core/pydantic/insert.py | 2 +- .../models/impl/url/suggestion/agency/auto.py | 2 +- .../core/common/annotation_exists.py | 10 +- .../get/recent_batch_summaries/builder.py | 11 +- .../pending_url/__init__.py | 0 .../recent_batch_summaries/pending_url/cte.py | 30 ++ .../url_counts/builder.py | 75 ++--- .../url_counts/cte/__init__.py | 0 .../url_counts/cte/all.py | 20 ++ .../url_counts/cte/duplicate.py | 29 ++ .../url_counts/cte/error.py | 29 ++ .../url_counts/cte/not_relevant.py | 34 ++ .../url_counts/cte/pending.py | 33 ++ .../url_counts/cte/submitted.py | 32 ++ .../url_counts/cte_container.py | 18 ++ .../core/metrics/urls/aggregated/pending.py | 2 +- src/db/statement_composer.py | 5 +- src/db/templates/requester.py | 15 + src/util/alembic_helpers.py | 19 +- .../integration/api/batch/__init__.py | 0 .../api/batch/summaries/__init__.py | 0 .../api/batch/summaries/test_happy_path.py | 95 ++++++ .../summaries/test_pending_url_filter.py | 72 +++++ .../integration/api/batch/test_batch.py | 64 ++++ .../api/example_collector/test_happy_path.py | 2 +- .../api/metrics/batches/test_aggregated.py | 79 +++-- .../api/metrics/batches/test_breakdown.py | 110 ++++--- 
.../integration/api/metrics/test_backlog.py | 83 ++--- .../api/metrics/urls/aggregated/test_core.py | 73 ++--- .../metrics/urls/breakdown/test_pending.py | 13 +- .../metrics/urls/breakdown/test_submitted.py | 13 +- .../integration/api/review/conftest.py | 18 +- .../rejection/test_individual_record.py | 11 +- .../api/review/rejection/test_not_relevant.py | 8 +- .../test_approve_and_get_next_source.py | 9 +- .../api/review/test_batch_filtering.py | 18 +- tests/automated/integration/api/test_batch.py | 237 -------------- .../integration/api/test_manual_batch.py | 2 +- .../integration/db/client/test_insert_urls.py | 4 +- .../scheduled/impl/huggingface/setup/check.py | 30 ++ .../scheduled/impl/huggingface/setup/data.py | 95 ++---- .../scheduled/impl/huggingface/setup/enums.py | 7 + .../impl/huggingface/setup/helper.py | 16 + .../impl/huggingface/setup/manager.py | 43 --- .../impl/huggingface/setup/models/entry.py | 12 - .../impl/huggingface/setup/models/input.py | 5 +- .../impl/huggingface/setup/models/output.py | 21 -- .../impl/huggingface/setup/models/record.py | 11 - .../impl/huggingface/setup/queries/convert.py | 14 + .../impl/huggingface/setup/queries/setup.py | 57 ++-- .../impl/huggingface/test_happy_path.py | 42 --- .../test_no_html_content_not_picked_up.py | 45 +++ .../test_not_relevant_picked_up.py | 58 ++++ .../test_not_validated_not_picked_up.py | 44 +++ .../huggingface/test_validated_picked_up.py | 60 ++++ .../scheduled/impl/sync/data_sources/check.py | 11 +- .../impl/sync/data_sources/conftest.py | 33 +- .../sync/data_sources/existence_checker.py | 42 --- .../impl/sync/data_sources/setup/core.py | 78 ++++- .../impl/sync/data_sources/setup/data.py | 100 ------ .../impl/sync/data_sources/setup/enums.py | 16 - .../sync/data_sources/setup/manager/agency.py | 31 -- .../sync/data_sources/setup/manager/core.py | 111 ------- .../setup/manager/queries/check.py | 46 --- .../sync/data_sources/setup/manager/url.py | 97 ------ .../data_sources/setup/models/url/core.py | 
14 - .../setup/models/url/data_sources.py | 20 -- .../data_sources/setup/models/url/post.py | 50 --- .../setup/models/url/source_collector.py | 17 - .../data_sources/setup/queries/__init__.py | 0 .../setup/queries/url_/__init__.py | 0 .../setup/queries/url_/requester.py | 59 ++++ .../data_sources/setup/queries/url_/url.py | 35 +++ .../impl/sync/data_sources/test_db_only.py | 76 +++++ .../impl/sync/data_sources/test_happy_path.py | 62 ---- .../sync/data_sources/test_interruption.py | 108 ++++--- .../sync/data_sources/test_multiple_calls.py | 107 +++++++ .../sync/data_sources/test_no_new_results.py | 59 ---- .../data_sources/test_url_broken_approved.py | 85 +++++ .../test_url_in_db_overwritten_by_ds.py | 94 ++++++ .../sync/data_sources/test_url_ok_approved.py | 63 ++++ .../happy_path/test_happy_path.py | 8 +- .../tasks/url/impl/auto_relevant/test_task.py | 2 +- .../html/mocks/url_request_interface/setup.py | 16 +- .../tasks/url/impl/html/setup/data.py | 6 +- .../url/impl/probe/no_redirect/test_ok.py | 4 +- .../impl/probe/no_redirect/test_two_urls.py | 2 +- .../probe/redirect/dest_new/test_dest_ok.py | 6 +- .../probe/redirect/test_dest_exists_in_db.py | 4 +- .../probe/redirect/test_redirect_infinite.py | 4 +- .../probe/redirect/test_two_urls_same_dest.py | 8 +- .../tasks/url/impl/test_url_404_probe.py | 8 +- .../helpers/batch_creation_parameters/core.py | 4 +- .../batch_creation_parameters/enums.py | 11 + .../url_creation_parameters.py | 11 +- tests/helpers/counter.py | 7 + .../data_creator/commands/impl/batch.py | 2 +- .../commands/impl/urls_/__init__.py | 0 .../commands/impl/urls_/convert.py | 36 +++ .../commands/impl/{urls.py => urls_/query.py} | 14 +- .../commands/impl/urls_v2/core.py | 10 +- .../commands/impl/urls_v2/response.py | 3 +- tests/helpers/data_creator/core.py | 111 ++++++- tests/helpers/data_creator/create.py | 71 +++++ tests/helpers/data_creator/generate.py | 80 +++++ tests/helpers/data_creator/insert.py | 10 + .../models/creation_info/batch/v2.py | 4 
+- .../data_creator/models/creation_info/url.py | 3 +- tests/helpers/setup/annotation/core.py | 2 +- tests/helpers/simple_test_data_functions.py | 25 +- .../lifecycle/test_auto_googler_lifecycle.py | 2 +- .../core/lifecycle/test_ckan_lifecycle.py | 2 +- .../lifecycle/test_muckrock_lifecycles.py | 2 +- 211 files changed, 3757 insertions(+), 2066 deletions(-) rename {tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager => src/api/endpoints/metrics/backlog}/__init__.py (100%) create mode 100644 src/api/endpoints/metrics/backlog/query.py delete mode 100644 src/api/endpoints/metrics/batches/aggregated/query.py rename {tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries => src/api/endpoints/metrics/batches/aggregated/query}/__init__.py (100%) rename {tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models => src/api/endpoints/metrics/batches/aggregated/query/all_urls}/__init__.py (100%) create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py rename {tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url => src/api/endpoints/metrics/batches/aggregated/query/batch_status_}/__init__.py (100%) create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/core.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/pending/query.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py create mode 100644 
src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/url_error/__init__.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/validated_/__init__.py create mode 100644 src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/error/__init__.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/error/cte_.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/not_relevant/__init__.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/pending/__init__.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/pending/cte_.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/submitted/__init__.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/templates/__init__.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/templates/cte_.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/total/__init__.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/total/cte_.py create mode 100644 src/api/endpoints/metrics/batches/breakdown/validated/__init__.py create mode 100644 
src/api/endpoints/metrics/batches/breakdown/validated/cte_.py create mode 100644 src/api/endpoints/metrics/urls/__init__.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/__init__.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/__init__.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/core.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/__init__.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py create mode 100644 src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py create mode 100644 src/api/endpoints/metrics/urls/breakdown/__init__.py create mode 100644 src/api/endpoints/metrics/urls/breakdown/query/__init__.py create mode 100644 src/api/endpoints/metrics/urls/breakdown/query/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py create mode 100644 src/db/models/impl/batch/pydantic/__init__.py rename src/db/models/impl/batch/{pydantic.py => pydantic/info.py} (100%) create mode 100644 src/db/models/impl/batch/pydantic/insert.py create mode 100644 src/db/models/impl/flag/url_validated/__init__.py create mode 100644 src/db/models/impl/flag/url_validated/enums.py create mode 100644 src/db/models/impl/flag/url_validated/pydantic.py create mode 100644 src/db/models/impl/flag/url_validated/sqlalchemy.py create mode 100644 src/db/models/impl/link/batch_url/__init__.py create mode 100644 src/db/models/impl/link/batch_url/pydantic.py rename src/db/models/impl/link/{batch_url.py => batch_url/sqlalchemy.py} (100%) create mode 100644 
src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py create mode 100644 src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py create mode 100644 src/db/templates/requester.py create mode 100644 tests/automated/integration/api/batch/__init__.py create mode 100644 tests/automated/integration/api/batch/summaries/__init__.py create mode 100644 tests/automated/integration/api/batch/summaries/test_happy_path.py create mode 100644 tests/automated/integration/api/batch/summaries/test_pending_url_filter.py create mode 100644 tests/automated/integration/api/batch/test_batch.py delete mode 100644 tests/automated/integration/api/test_batch.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py delete mode 100644 
tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py delete mode 100644 
tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py create mode 100644 tests/helpers/batch_creation_parameters/enums.py create mode 100644 tests/helpers/counter.py create mode 100644 tests/helpers/data_creator/commands/impl/urls_/__init__.py create mode 100644 tests/helpers/data_creator/commands/impl/urls_/convert.py rename tests/helpers/data_creator/commands/impl/{urls.py => urls_/query.py} (79%) create mode 100644 tests/helpers/data_creator/create.py create mode 100644 tests/helpers/data_creator/generate.py create mode 100644 tests/helpers/data_creator/insert.py diff --git 
a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py index 801af52f..84db9b19 100644 --- a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py +++ b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py @@ -10,7 +10,7 @@ from alembic import op import sqlalchemy as sa -from src.util.alembic_helpers import created_at_column, updated_at_column +from src.util.alembic_helpers import created_at_column, updated_at_column, id_column, url_id_column, switch_enum_type # revision identifiers, used by Alembic. revision: str = 'b741b65a1431' @@ -30,15 +30,157 @@ "muckrock_match", "ckan_match", "unknown", - name="agency_auto_suggestion_method" + name="agency_auto_suggestion_method", ) +FLAG_URL_VALIDATED_TABLE_NAME = "flag_url_validated" + +VALIDATED_URL_TYPE_ENUM = sa.Enum( + "data source", + "meta url", + "not relevant", + "individual record", + name="validated_url_type" +) + + + + def upgrade() -> None: op.rename_table(OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) op.rename_table(OLD_LINK_URLS_AGENCY_TABLE_NAME, NEW_LINK_URLS_AGENCY_TABLE_NAME) _alter_auto_agency_suggestions_table() + _create_flag_url_validated_table() + _add_urls_to_flag_url_validated_table() + _remove_validated_and_submitted_url_statuses() + + +def downgrade() -> None: + op.rename_table(NEW_LINK_URLS_AGENCY_TABLE_NAME, OLD_LINK_URLS_AGENCY_TABLE_NAME) + _revert_auto_agency_suggestions_table() + op.rename_table(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) + _revert_url_statuses() + _update_validated_and_submitted_url_statuses() + op.drop_table(FLAG_URL_VALIDATED_TABLE_NAME) + _drop_validated_url_type_enum() + +def _remove_validated_and_submitted_url_statuses(): + switch_enum_type( + table_name="urls", + column_name="status", + enum_name="url_status", + 
new_enum_values=[ + 'ok', + 'duplicate', + 'error', + '404 not found', + ], + check_constraints_to_drop=['url_name_not_null_when_validated'], + conversion_mappings={ + 'validated': 'ok', + 'submitted': 'ok', + 'pending': 'ok', + 'not relevant': 'ok', + 'individual record': 'ok' + } + ) + +def _add_urls_to_flag_url_validated_table(): + op.execute(""" + INSERT INTO flag_url_validated (url_id, type) + SELECT + urls.id, + CASE urls.status::text + WHEN 'validated' THEN 'data source' + WHEN 'submitted' THEN 'data source' + ELSE urls.status::text + END::validated_url_type + FROM urls + WHERE urls.status in ('validated', 'submitted', 'individual record', 'not relevant')""") + +def _revert_url_statuses(): + switch_enum_type( + table_name="urls", + column_name="status", + enum_name="url_status", + new_enum_values=[ + 'pending', + 'validated', + 'submitted', + 'duplicate', + 'not relevant', + 'error', + '404 not found', + 'individual record' + ], + conversion_mappings={ + 'ok': 'pending', + } + ) + op.create_check_constraint( + "url_name_not_null_when_validated", + "urls", + "(name IS NOT NULL) OR (status <> 'validated'::url_status)" + ) + +def _update_validated_and_submitted_url_statuses(): + op.execute(""" + UPDATE urls + SET status = 'not relevant' + FROM flag_url_validated + WHERE urls.id = flag_url_validated.id + AND flag_url_validated.type = 'not relevant' + """) + + op.execute(""" + UPDATE urls + SET status = 'individual record' + FROM flag_url_validated + WHERE urls.id = flag_url_validated.id + AND flag_url_validated.type = 'individual record' + """) + + op.execute(""" + UPDATE urls + SET status = 'validated' + FROM flag_url_validated + left join url_data_source on flag_url_validated.url_id = url_data_source.url_id + WHERE urls.id = flag_url_validated.id + AND flag_url_validated.type = 'data source' + AND url_data_source.url_id is NULL + """) + + op.execute(""" + UPDATE urls + SET status = 'validated' + FROM flag_url_validated + left join url_data_source on 
flag_url_validated.url_id = url_data_source.url_id + WHERE urls.id = flag_url_validated.id + AND flag_url_validated.type = 'data source' + AND url_data_source.url_id is not NULL + """) + + +def _create_flag_url_validated_table(): + op.create_table( + FLAG_URL_VALIDATED_TABLE_NAME, + id_column(), + url_id_column(), + sa.Column( + 'type', + VALIDATED_URL_TYPE_ENUM, + nullable=False, + ), + created_at_column(), + updated_at_column(), + sa.UniqueConstraint('url_id', name='uq_flag_url_validated_url_id') + ) + +def _drop_validated_url_type_enum(): + VALIDATED_URL_TYPE_ENUM.drop(op.get_bind()) def _alter_auto_agency_suggestions_table(): + AGENCY_AUTO_SUGGESTION_METHOD_ENUM.create(op.get_bind()) # Created At op.add_column( NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, @@ -52,7 +194,12 @@ def _alter_auto_agency_suggestions_table(): # Method op.add_column( NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, - sa.Column('method', AGENCY_AUTO_SUGGESTION_METHOD_ENUM, default="unknown", nullable=False) + sa.Column( + 'method', + AGENCY_AUTO_SUGGESTION_METHOD_ENUM, + server_default="unknown", + nullable=False + ) ) # Confidence op.add_column( @@ -60,7 +207,7 @@ def _alter_auto_agency_suggestions_table(): sa.Column( 'confidence', sa.Float(), - default=0.0, + server_default=sa.text('0.0'), nullable=False ) ) @@ -93,8 +240,5 @@ def _revert_auto_agency_suggestions_table(): NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, 'confidence' ) + AGENCY_AUTO_SUGGESTION_METHOD_ENUM.drop(op.get_bind()) -def downgrade() -> None: - op.rename_table(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) - op.rename_table(NEW_LINK_URLS_AGENCY_TABLE_NAME, OLD_LINK_URLS_AGENCY_TABLE_NAME) - _revert_auto_agency_suggestions_table() diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 9b3ffdeb..5a56cf32 100644 --- 
a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -5,7 +5,7 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -42,7 +42,7 @@ async def run( ) common_where_clause = [ - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, LinkBatchURL.batch_id == self.batch_id, ] diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index a6a5b69d..cce1a969 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,7 +5,7 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -43,7 +43,7 @@ async def run(self, session: AsyncSession): query = ( query - .where(URL.status == URLStatus.PENDING.value) + .where(URL.status == URLStatus.OK.value) # URL must not have user suggestion .where( StatementComposer.user_suggestion_not_exists(self.user_suggestion_model_to_exclude) diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py 
b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 70ae112a..ea0ae85e 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -9,7 +9,7 @@ from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion @@ -48,7 +48,7 @@ async def run( # Must not have confirmed agencies query = query.where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ) diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index a2afafd9..dbda0f8b 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -10,7 +10,7 @@ from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -39,7 +39,7 @@ async def run( query .where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, StatementComposer.user_suggestion_not_exists(UserUrlAgencySuggestion), StatementComposer.user_suggestion_not_exists(UserRecordTypeSuggestion), 
StatementComposer.user_suggestion_not_exists(UserRelevantSuggestion), diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index 2d8edff9..b09b6e5d 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -5,7 +5,7 @@ from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.duplicate.sqlalchemy import Duplicate -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 6a88448f..391a265f 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,7 +1,7 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 12b17ad3..73e3edb8 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -6,7 +6,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import 
URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -47,7 +47,7 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, - status=URLStatus.PENDING.value, + status=URLStatus.OK.value, record_type=entry.record_type.value if entry.record_type is not None else None, source=URLSource.MANUAL ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/__init__.py b/src/api/endpoints/metrics/backlog/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/__init__.py rename to src/api/endpoints/metrics/backlog/__init__.py diff --git a/src/api/endpoints/metrics/backlog/query.py b/src/api/endpoints/metrics/backlog/query.py new file mode 100644 index 00000000..788ef424 --- /dev/null +++ b/src/api/endpoints/metrics/backlog/query.py @@ -0,0 +1,53 @@ +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO +from src.db.models.impl.backlog_snapshot import BacklogSnapshot +from src.db.queries.base.builder import QueryBuilderBase + + +class GetBacklogMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsBacklogResponseDTO: + month = func.date_trunc('month', BacklogSnapshot.created_at) + + # 1. Create a subquery that assigns row_number() partitioned by month + monthly_snapshot_subq = ( + select( + BacklogSnapshot.id, + BacklogSnapshot.created_at, + BacklogSnapshot.count_pending_total, + month.label("month_start"), + func.row_number() + .over( + partition_by=month, + order_by=BacklogSnapshot.created_at.desc() + ) + .label("row_number") + ) + .subquery() + ) + + # 2. 
Filter for the top (most recent) row in each month + stmt = ( + select( + monthly_snapshot_subq.c.month_start, + monthly_snapshot_subq.c.created_at, + monthly_snapshot_subq.c.count_pending_total + ) + .where(monthly_snapshot_subq.c.row_number == 1) + .order_by(monthly_snapshot_subq.c.month_start) + ) + + raw_result = await session.execute(stmt) + results = raw_result.all() + final_results = [] + for result in results: + final_results.append( + GetMetricsBacklogResponseInnerDTO( + month=result.month_start.strftime("%B %Y"), + count_pending_total=result.count_pending_total, + ) + ) + + return GetMetricsBacklogResponseDTO(entries=final_results) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py deleted file mode 100644 index e7de65fb..00000000 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ /dev/null @@ -1,117 +0,0 @@ -from sqlalchemy import case, select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.functions import coalesce - -from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ - GetMetricsBatchesAggregatedInnerResponseDTO -from src.collectors.enums import URLStatus, CollectorType -from src.core.enums import BatchStatus -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase): - - async def run( - self, - session: AsyncSession - ) -> GetMetricsBatchesAggregatedResponseDTO: - sc = StatementComposer - - # First, get all batches broken down by collector type and status - def batch_column(status: BatchStatus, label): - return sc.count_distinct( - case( - ( - Batch.status == status.value, - 
Batch.id - ) - ), - label=label - ) - - batch_count_subquery = select( - batch_column(BatchStatus.READY_TO_LABEL, label="done_count"), - batch_column(BatchStatus.ERROR, label="error_count"), - Batch.strategy, - ).group_by(Batch.strategy).subquery("batch_count") - - def url_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) - - # Next, count urls - url_count_subquery = select( - Batch.strategy, - url_column(URLStatus.PENDING, label="pending_count"), - url_column(URLStatus.ERROR, label="error_count"), - url_column(URLStatus.VALIDATED, label="validated_count"), - url_column(URLStatus.SUBMITTED, label="submitted_count"), - url_column(URLStatus.NOT_RELEVANT, label="rejected_count"), - - ).join( - LinkBatchURL, - LinkBatchURL.url_id == URL.id - ).outerjoin( - Batch, Batch.id == LinkBatchURL.batch_id - ).group_by( - Batch.strategy - ).subquery("url_count") - - # Combine - query = select( - Batch.strategy, - batch_count_subquery.c.done_count.label("batch_done_count"), - batch_count_subquery.c.error_count.label("batch_error_count"), - coalesce(url_count_subquery.c.pending_count, 0).label("pending_count"), - coalesce(url_count_subquery.c.error_count, 0).label("error_count"), - coalesce(url_count_subquery.c.submitted_count, 0).label("submitted_count"), - coalesce(url_count_subquery.c.rejected_count, 0).label("rejected_count"), - coalesce(url_count_subquery.c.validated_count, 0).label("validated_count") - ).join( - batch_count_subquery, - Batch.strategy == batch_count_subquery.c.strategy - ).outerjoin( - url_count_subquery, - Batch.strategy == url_count_subquery.c.strategy - ) - raw_results = await session.execute(query) - results = raw_results.all() - d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} - for result in results: - d[CollectorType(result.strategy)] = GetMetricsBatchesAggregatedInnerResponseDTO( - count_successful_batches=result.batch_done_count, - 
count_failed_batches=result.batch_error_count, - count_urls=result.pending_count + result.submitted_count + - result.rejected_count + result.error_count + - result.validated_count, - count_urls_pending=result.pending_count, - count_urls_validated=result.validated_count, - count_urls_submitted=result.submitted_count, - count_urls_rejected=result.rejected_count, - count_urls_errors=result.error_count - ) - - total_batch_query = await session.execute( - select( - sc.count_distinct(Batch.id, label="count") - ) - ) - total_batch_count = total_batch_query.scalars().one_or_none() - if total_batch_count is None: - total_batch_count = 0 - - return GetMetricsBatchesAggregatedResponseDTO( - total_batches=total_batch_count, - by_strategy=d - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/__init__.py diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/all_urls/__init__.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py new file mode 100644 index 00000000..7eed215a --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py @@ -0,0 +1,28 @@ +from typing import Sequence + +from sqlalchemy import func, select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class CountAllURLsByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(LinkBatchURL.url_id).label("count") + ) + .join(LinkBatchURL) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/batch_status_/__init__.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py new file mode 100644 index 00000000..f8587b68 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import CTE, select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ + BatchStatusCountByBatchStrategyResponseDTO +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from 
src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class BatchStatusByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[BatchStatusCountByBatchStrategyResponseDTO]: + query = ( + select( + Batch.strategy, + Batch.status, + func.count(Batch.id).label("count") + ) + .group_by(Batch.strategy, Batch.status) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + results: list[BatchStatusCountByBatchStrategyResponseDTO] = [] + for mapping in mappings: + results.append( + BatchStatusCountByBatchStrategyResponseDTO( + strategy=CollectorType(mapping["strategy"]), + status=BatchStatus(mapping["status"]), + count=mapping["count"] + ) + ) + return results \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py new file mode 100644 index 00000000..79c1b2dd --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus + + +class BatchStatusCountByBatchStrategyResponseDTO(BaseModel): + strategy: CollectorType + status: BatchStatus + count: int \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py new file mode 100644 index 00000000..8ffe3753 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -0,0 +1,79 @@ +from sqlalchemy import case, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.sql.functions import coalesce, func + +from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ + GetMetricsBatchesAggregatedInnerResponseDTO +from 
src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ + BatchStatusByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.requester_.requester import \ + GetBatchesAggregatedMetricsQueryRequester +from src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ + CountSubmittedByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \ + ValidatedURLCountByBatchStrategyQueryBuilder +from src.collectors.enums import URLStatus, CollectorType +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase +from src.db.statement_composer import StatementComposer + + +class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> GetMetricsBatchesAggregatedResponseDTO: + + requester = GetBatchesAggregatedMetricsQueryRequester(session=session) + + url_error_count_dict: dict[CollectorType, int] = await requester.url_error_by_collector_strategy() + url_pending_count_dict: dict[CollectorType, int] = await requester.pending_url_count_by_collector_strategy() + url_submitted_count_dict: dict[CollectorType, int] = await requester.submitted_url_count_by_collector_strategy() + url_validated_count_dict: dict[CollectorType, int] = await 
requester.validated_url_count_by_collector_strategy() + url_rejected_count_dict: dict[CollectorType, int] = await requester.rejected_url_count_by_collector_strategy() + url_total_count_dict: dict[CollectorType, int] = await requester.url_count_by_collector_strategy() + batch_status_count_dict: dict[ + CollectorType, + dict[BatchStatus, int] + ] = await requester.batch_status_by_collector_strategy() + + + + + + d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} + for collector_type in CollectorType: + inner_response = GetMetricsBatchesAggregatedInnerResponseDTO( + count_successful_batches=batch_status_count_dict[collector_type][BatchStatus.READY_TO_LABEL], + count_failed_batches=batch_status_count_dict[collector_type][BatchStatus.ERROR], + count_urls=url_total_count_dict[collector_type], + count_urls_pending=url_pending_count_dict[collector_type], + count_urls_validated=url_validated_count_dict[collector_type], + count_urls_submitted=url_submitted_count_dict[collector_type], + count_urls_rejected=url_rejected_count_dict[collector_type], + count_urls_errors=url_error_count_dict[collector_type], + ) + d[collector_type] = inner_response + + total_batch_query = await session.execute( + select( + func.count(Batch.id).label("count") + ) + ) + total_batch_count = total_batch_query.scalars().one_or_none() + if total_batch_count is None: + total_batch_count = 0 + + return GetMetricsBatchesAggregatedResponseDTO( + total_batches=total_batch_count, + by_strategy=d + ) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py b/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py new file mode 100644 index 00000000..9ceb7781 --- /dev/null +++ 
b/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.collectors.enums import CollectorType + + +class CountByBatchStrategyResponse(BaseModel): + strategy: CollectorType + count: int \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py new file mode 100644 index 00000000..224d3bad --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class PendingURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(LinkBatchURL.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where(FlagURLValidated.url_id.is_(None)) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in 
mappings] + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py new file mode 100644 index 00000000..d1505f97 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py @@ -0,0 +1,39 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class RejectedURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(FlagURLValidated.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where(FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py 
b/src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py new file mode 100644 index 00000000..4a129dfb --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py @@ -0,0 +1,11 @@ +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import CollectorType + + +def convert_strategy_counts_to_strategy_count_dict( + responses: list[CountByBatchStrategyResponse] +) -> dict[CollectorType, int]: + result: dict[CollectorType, int] = {collector_type: 0 for collector_type in CollectorType} + for response in responses: + result[response.strategy] = response.count + return result \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py new file mode 100644 index 00000000..ac4c6dfa --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py @@ -0,0 +1,75 @@ + +from src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ + BatchStatusByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ + BatchStatusCountByBatchStrategyResponseDTO +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.api.endpoints.metrics.batches.aggregated.query.pending.query import PendingURLCountByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.rejected.query import \ + 
RejectedURLCountByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.requester_.convert import \ + convert_strategy_counts_to_strategy_count_dict +from src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ + CountSubmittedByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \ + ValidatedURLCountByBatchStrategyQueryBuilder +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.queries.base.builder import QueryBuilderBase +from src.db.templates.requester import RequesterBase + + +class GetBatchesAggregatedMetricsQueryRequester(RequesterBase): + + async def _run_strategy_count_query_builder( + self, query_builder: type[QueryBuilderBase]) -> dict[CollectorType, int]: + responses: list[CountByBatchStrategyResponse] = \ + await query_builder().run(self.session) + + return convert_strategy_counts_to_strategy_count_dict(responses) + + async def url_error_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(URLErrorByBatchStrategyQueryBuilder) + + async def url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(CountAllURLsByBatchStrategyQueryBuilder) + + async def submitted_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(CountSubmittedByBatchStrategyQueryBuilder) + + async def validated_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(ValidatedURLCountByBatchStrategyQueryBuilder) + + async def rejected_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await 
self._run_strategy_count_query_builder(RejectedURLCountByBatchStrategyQueryBuilder) + + async def pending_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(PendingURLCountByBatchStrategyQueryBuilder) + + async def batch_status_by_collector_strategy(self) -> dict[ + CollectorType, + dict[BatchStatus, int] + ]: + + responses: list[BatchStatusCountByBatchStrategyResponseDTO] = \ + await BatchStatusByBatchStrategyQueryBuilder().run(self.session) + + result: dict[CollectorType, dict[BatchStatus, int]] = { + collector_type: { + BatchStatus.ERROR: 0, + BatchStatus.READY_TO_LABEL: 0, + } + for collector_type in CollectorType + } + for response in responses: + if response.status not in ( + BatchStatus.ERROR, + BatchStatus.READY_TO_LABEL + ): + continue + result[response.strategy][response.status] = response.count + + return result + diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py new file mode 100644 index 00000000..ee8f8065 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py @@ -0,0 +1,45 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import CollectorType +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from 
from typing import Sequence

from sqlalchemy import select, func, RowMapping
from sqlalchemy.ext.asyncio import AsyncSession

from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse
from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
from src.collectors.enums import CollectorType, URLStatus
from src.db.helpers.session import session_helper as sh
from src.db.models.impl.batch.sqlalchemy import Batch
from src.db.models.impl.flag.url_validated.enums import ValidatedURLType
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource
from src.db.queries.base.builder import QueryBuilderBase


def _to_strategy_counts(
    mappings: Sequence[RowMapping],
) -> list[CountByBatchStrategyResponse]:
    """Convert (strategy, count) row mappings into response DTOs.

    The raw `strategy` value is converted to `CollectorType` explicitly so
    all three builders below behave identically (previously only the
    submitted-count builder converted; the others passed the raw value
    through `**mapping`).
    """
    return [
        CountByBatchStrategyResponse(
            strategy=CollectorType(mapping["strategy"]),
            count=mapping["count"],
        )
        for mapping in mappings
    ]


class CountSubmittedByBatchStrategyQueryBuilder(QueryBuilderBase):
    """Count URLs that have an associated data source, grouped by batch strategy."""

    async def run(
        self, session: AsyncSession
    ) -> list[CountByBatchStrategyResponse]:
        query = (
            select(
                Batch.strategy,
                func.count(URLDataSource.id).label("count"),
            )
            .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
            .join(URLDataSource, URLDataSource.url_id == LinkBatchURL.url_id)
            .group_by(Batch.strategy)
        )
        mappings: Sequence[RowMapping] = await sh.mappings(session, query=query)
        return _to_strategy_counts(mappings)


# --- src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py ---
class URLErrorByBatchStrategyQueryBuilder(QueryBuilderBase):
    """Count URLs in ERROR status, grouped by batch strategy."""

    async def run(
        self, session: AsyncSession
    ) -> list[CountByBatchStrategyResponse]:
        # Explicit ON clauses for consistency with the sibling builders.
        # URL.status was dropped from GROUP BY: the WHERE clause fixes it
        # to a single value, so grouping by it was redundant.
        query = (
            select(
                Batch.strategy,
                func.count(URL.id).label("count"),
            )
            .select_from(Batch)
            .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
            .join(URL, URL.id == LinkBatchURL.url_id)
            .where(URL.status == URLStatus.ERROR)
            .group_by(Batch.strategy)
        )
        mappings: Sequence[RowMapping] = await sh.mappings(session, query=query)
        return _to_strategy_counts(mappings)


# --- src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py ---
class ValidatedURLCountByBatchStrategyQueryBuilder(QueryBuilderBase):
    """Count URLs with a validation flag, grouped by batch strategy."""

    async def run(
        self, session: AsyncSession
    ) -> list[CountByBatchStrategyResponse]:
        query = (
            select(
                Batch.strategy,
                func.count(FlagURLValidated.url_id).label("count"),
            )
            .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
            .join(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id)
            .group_by(Batch.strategy)
        )
        mappings: Sequence[RowMapping] = await sh.mappings(session, query=query)
        return _to_strategy_counts(mappings)


# --- src/api/endpoints/metrics/batches/breakdown/error/cte_.py ---
# Per-batch count of URLs whose processing errored.
URL_ERROR_CTE = BatchesBreakdownURLCTE(
    select(
        Batch.id,
        func.count(LinkBatchURL.url_id).label("count_error"),
    )
    .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
    .join(URL, URL.id == LinkBatchURL.url_id)
    .where(URL.status == URLStatus.ERROR)
    .group_by(Batch.id)
    .cte("error")
)


# --- src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py ---
# Per-batch count of URLs validated as NOT_RELEVANT.
NOT_RELEVANT_CTE = BatchesBreakdownURLCTE(
    select(
        Batch.id,
        func.count(FlagURLValidated.url_id).label("count_rejected"),
    )
    .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
    .join(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id)
    .where(FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT)
    .group_by(Batch.id)
    .cte("not_relevant")
)


# --- src/api/endpoints/metrics/batches/breakdown/pending/cte_.py ---
# Per-batch count of URLs with no validation flag yet.
# NOTE(review): unlike PENDING_SUBQUERY in the urls/aggregated metrics, this
# does not also filter on URL.status == OK -- confirm whether errored URLs
# should count as "pending" here.
PENDING_CTE = BatchesBreakdownURLCTE(
    select(
        Batch.id,
        func.count(LinkBatchURL.url_id).label("count_pending"),
    )
    .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
    .outerjoin(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id)
    .where(FlagURLValidated.url_id.is_(None))
    .group_by(Batch.id)
    .cte("pending")
)
100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -1,13 +1,20 @@ -from sqlalchemy import select, case +from sqlalchemy import select, case, Column from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.functions import coalesce from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO, \ GetMetricsBatchesBreakdownInnerResponseDTO +from src.api.endpoints.metrics.batches.breakdown.error.cte_ import URL_ERROR_CTE +from src.api.endpoints.metrics.batches.breakdown.not_relevant.cte_ import NOT_RELEVANT_CTE +from src.api.endpoints.metrics.batches.breakdown.pending.cte_ import PENDING_CTE +from src.api.endpoints.metrics.batches.breakdown.submitted.cte_ import SUBMITTED_CTE +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.api.endpoints.metrics.batches.breakdown.total.cte_ import TOTAL_CTE +from src.api.endpoints.metrics.batches.breakdown.validated.cte_ import VALIDATED_CTE from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -32,28 +39,32 @@ async def run(self, session: AsyncSession) -> GetMetricsBatchesBreakdownResponse Batch.date_generated.label("created_at"), ) - def url_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) + all_ctes: list[BatchesBreakdownURLCTE] = [ + URL_ERROR_CTE, + NOT_RELEVANT_CTE, + PENDING_CTE, + SUBMITTED_CTE, + TOTAL_CTE, + VALIDATED_CTE + ] + + count_columns: list[Column] = [ + 
from sqlalchemy import select, func

from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
from src.db.models.impl.batch.sqlalchemy import Batch
from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource

# Per-batch count of URLs that have been submitted as data sources
# (i.e. a URLDataSource row exists for the linked URL).
_submitted_counts = (
    select(
        Batch.id,
        func.count(URLDataSource.id).label("count_submitted"),
    )
    .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
    .join(URLDataSource, URLDataSource.url_id == LinkBatchURL.url_id)
    .group_by(Batch.id)
)

SUBMITTED_CTE = BatchesBreakdownURLCTE(_submitted_counts.cte("submitted"))
# --- src/api/endpoints/metrics/batches/breakdown/templates/cte_.py ---
# BUG FIX: `Column` was imported from `psycopg` (the DB-API cursor-description
# type), not from sqlalchemy.  Both names are only used in annotations, so the
# imports are deferred behind TYPE_CHECKING and annotations are made lazy.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from sqlalchemy import CTE, Column


class BatchesBreakdownURLCTE:
    """Thin wrapper around a two-column per-batch-count CTE.

    Positional contract: column 0 is the batch id, column 1 is the
    aggregated count for that batch.
    """

    def __init__(self, query: CTE):
        self._query = query

    @property
    def query(self) -> CTE:
        """The wrapped CTE, for use as a join target."""
        return self._query

    @property
    def batch_id(self) -> Column:
        """First column: the batch id the count is keyed on."""
        return self._query.columns[0]

    @property
    def count(self) -> Column:
        """Second column: the aggregated count for that batch."""
        return self._query.columns[1]
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession

from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.all import ALL_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.error import ERROR_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.pending import PENDING_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.rejected import REJECTED_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.submitted import SUBMITTED_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.validated import VALIDATED_SUBQUERY
from src.collectors.enums import URLStatus
from src.db.helpers.session import session_helper as sh
from src.db.models.impl.batch.sqlalchemy import Batch
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.queries.base.builder import QueryBuilderBase


# --- src/api/endpoints/metrics/batches/breakdown/validated/cte_.py ---
# Per-batch count of URLs that carry a validation flag.
VALIDATED_CTE = BatchesBreakdownURLCTE(
    select(
        Batch.id,
        func.count(FlagURLValidated.url_id).label("count_validated"),
    )
    .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
    .join(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id)
    .group_by(Batch.id)
    .cte("validated")
)


# --- src/api/endpoints/metrics/urls/aggregated/query/core.py ---
class GetURLsAggregatedMetricsQueryBuilder(QueryBuilderBase):
    """Aggregate URL counts per review state, plus the oldest pending URL."""

    async def run(self, session: AsyncSession) -> GetMetricsURLsAggregatedResponseDTO:
        # FIX: "pending" means status OK *and* no validation flag yet -- the
        # same definition PENDING_SUBQUERY uses.  Previously only the status
        # was checked, so an already-validated OK URL could be reported as
        # the oldest pending one.
        oldest_pending_query = (
            select(URL.id, URL.created_at)
            .outerjoin(FlagURLValidated, FlagURLValidated.url_id == URL.id)
            .where(
                URL.status == URLStatus.OK.value,
                FlagURLValidated.url_id.is_(None),
            )
            .order_by(URL.created_at.asc())
            .limit(1)
        )
        row = (await session.execute(oldest_pending_query)).one_or_none()
        # Both fields are None when no URL is pending.
        oldest_pending_url_id = row.id if row is not None else None
        oldest_pending_created_at = row.created_at if row is not None else None

        return GetMetricsURLsAggregatedResponseDTO(
            count_urls_total=await sh.scalar(session, query=ALL_SUBQUERY),
            count_urls_pending=await sh.scalar(session, query=PENDING_SUBQUERY),
            count_urls_submitted=await sh.scalar(session, query=SUBMITTED_SUBQUERY),
            count_urls_validated=await sh.scalar(session, query=VALIDATED_SUBQUERY),
            count_urls_rejected=await sh.scalar(session, query=REJECTED_SUBQUERY),
            count_urls_errors=await sh.scalar(session, query=ERROR_SUBQUERY),
            oldest_pending_url_id=oldest_pending_url_id,
            oldest_pending_url_created_at=oldest_pending_created_at,
        )
from sqlalchemy import select, func

from src.collectors.enums import URLStatus
from src.db.models.impl.flag.url_validated.enums import ValidatedURLType
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource

# Shared counting column: every subquery returns a single "count" value.
_COUNT_URLS = func.count(URL.id).label("count")

# .../subqueries/all.py -- every URL on record.
ALL_SUBQUERY = select(_COUNT_URLS)

# .../subqueries/error.py -- URLs whose processing errored.
ERROR_SUBQUERY = select(_COUNT_URLS).where(URL.status == URLStatus.ERROR)

# .../subqueries/pending.py -- OK URLs with no validation flag yet.
PENDING_SUBQUERY = (
    select(_COUNT_URLS)
    .outerjoin(
        FlagURLValidated,
        URL.id == FlagURLValidated.url_id,
    )
    .where(
        URL.status == URLStatus.OK,
        FlagURLValidated.url_id.is_(None),
    )
)

# .../subqueries/rejected.py -- URLs validated as not relevant.
REJECTED_SUBQUERY = (
    select(_COUNT_URLS)
    .join(
        FlagURLValidated,
        URL.id == FlagURLValidated.url_id,
    )
    .where(FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT)
)

# .../subqueries/submitted.py -- URLs linked to a submitted data source.
SUBMITTED_SUBQUERY = (
    select(_COUNT_URLS)
    .join(
        URLDataSource,
        URL.id == URLDataSource.url_id,
    )
)
from sqlalchemy import select, func, case
from sqlalchemy.ext.asyncio import AsyncSession

from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseInnerDTO, \
    GetMetricsURLsBreakdownPendingResponseDTO
from src.collectors.enums import URLStatus
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.queries.base.builder import QueryBuilderBase


# --- src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py ---
# URLs carrying a validation flag.
VALIDATED_SUBQUERY = (
    select(func.count(URL.id).label("count"))
    .join(
        FlagURLValidated,
        URL.id == FlagURLValidated.url_id,
    )
)


# --- src/api/endpoints/metrics/urls/breakdown/query/core.py ---
class GetURLsBreakdownPendingMetricsQueryBuilder(QueryBuilderBase):
    """Per-month counts of pending URLs and their user-annotation coverage."""

    async def run(self, session: AsyncSession) -> GetMetricsURLsBreakdownPendingResponseDTO:
        # FIX: aggregate the per-URL annotation flags with bool_or and group
        # by URL id.  The previous plain outer joins produced one row per
        # suggestion, so a URL with several user suggestions of one type
        # fanned out into duplicate rows and inflated every outer count.
        flags = (
            select(
                URL.id.label("url_id"),
                func.bool_or(UserRecordTypeSuggestion.url_id.isnot(None)).label(
                    "has_user_record_type_annotation"
                ),
                func.bool_or(UserRelevantSuggestion.url_id.isnot(None)).label(
                    "has_user_relevant_annotation"
                ),
                func.bool_or(UserUrlAgencySuggestion.url_id.isnot(None)).label(
                    "has_user_agency_annotation"
                ),
            )
            .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id)
            .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id)
            .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id)
            .group_by(URL.id)
        ).cte("flags")

        # Bucket by creation month.
        month = func.date_trunc('month', URL.created_at)

        query = (
            select(
                month.label('month'),
                func.count(URL.id).label('count_total'),
                func.count(
                    case((flags.c.has_user_record_type_annotation.is_(True), 1))
                ).label('user_record_type_count'),
                func.count(
                    case((flags.c.has_user_relevant_annotation.is_(True), 1))
                ).label('user_relevant_count'),
                func.count(
                    case((flags.c.has_user_agency_annotation.is_(True), 1))
                ).label('user_agency_count'),
            )
            .outerjoin(flags, flags.c.url_id == URL.id)
            .outerjoin(
                FlagURLValidated,
                FlagURLValidated.url_id == URL.id
            )
            # Pending = OK status and not yet validated.
            .where(
                FlagURLValidated.url_id.is_(None),
                URL.status == URLStatus.OK
            )
            .group_by(month)
            .order_by(month.asc())
        )

        rows = (await session.execute(query)).all()
        entries = [
            GetMetricsURLsBreakdownPendingResponseInnerDTO(
                month=row.month.strftime("%B %Y"),
                count_pending_total=row.count_total,
                count_pending_relevant_user=row.user_relevant_count,
                count_pending_record_type_user=row.user_record_type_count,
                count_pending_agency_user=row.user_agency_count,
            )
            for row in rows
        ]
        return GetMetricsURLsBreakdownPendingResponseDTO(entries=entries)
a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index af810a2b..8af9af03 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -9,6 +9,8 @@ from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -30,76 +32,38 @@ def __init__( async def run(self, session: AsyncSession) -> None: # Get URL + url = await self._get_url(session) - query = ( - Select(URL) - .where(URL.id == self.approval_info.url_id) - .options( - joinedload(URL.optional_data_source_metadata), - joinedload(URL.confirmed_agencies), - ) - ) - - url = await session.execute(query) - url = url.scalars().first() - - update_if_not_none( - url, - "record_type", - self.approval_info.record_type.value - if self.approval_info.record_type is not None else None, - required=True - ) + await self._optionally_update_record_type(url) # Get existing agency ids existing_agencies = url.confirmed_agencies or [] existing_agency_ids = [agency.agency_id for agency in existing_agencies] new_agency_ids = self.approval_info.agency_ids or [] - if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail="Must specify agency_id if URL does not already have a confirmed agency" - ) + await self._check_for_unspecified_agency_ids(existing_agency_ids, new_agency_ids) - # Get any existing agency ids that are not in the new agency ids - # If new agency ids are specified, overwrite existing - 
if len(new_agency_ids) != 0: - for existing_agency in existing_agencies: - if existing_agency.id not in new_agency_ids: - # If the existing agency id is not in the new agency ids, delete it - await session.delete(existing_agency) + await self._overwrite_existing_agencies(existing_agencies, new_agency_ids, session) # Add any new agency ids that are not in the existing agency ids - for new_agency_id in new_agency_ids: - if new_agency_id not in existing_agency_ids: - # Check if the new agency exists in the database - query = ( - select(Agency) - .where(Agency.agency_id == new_agency_id) - ) - existing_agency = await session.execute(query) - existing_agency = existing_agency.scalars().first() - if existing_agency is None: - # If not, create it - agency = Agency( - agency_id=new_agency_id, - name=PLACEHOLDER_AGENCY_NAME, - ) - session.add(agency) - - # If the new agency id is not in the existing agency ids, add it - confirmed_url_agency = LinkURLAgency( - url_id=self.approval_info.url_id, - agency_id=new_agency_id - ) - session.add(confirmed_url_agency) + await self._add_new_agencies(existing_agency_ids, new_agency_ids, session) - # If it does, do nothing + await self._add_validated_flag(session, url=url) - url.status = URLStatus.VALIDATED.value + await self._optionally_update_required_metadata(url) + await self._optionally_update_optional_metdata(url) + await self._add_approving_user(session) + async def _optionally_update_required_metadata(self, url: URL) -> None: update_if_not_none(url, "name", self.approval_info.name, required=True) update_if_not_none(url, "description", self.approval_info.description, required=False) + async def _add_approving_user(self, session: AsyncSession) -> None: + approving_user_url = ReviewingUserURL( + user_id=self.user_id, + url_id=self.approval_info.url_id + ) + session.add(approving_user_url) + + async def _optionally_update_optional_metdata(self, url: URL) -> None: optional_metadata = url.optional_data_source_metadata if 
optional_metadata is None: url.optional_data_source_metadata = URLOptionalDataSourceMetadata( @@ -124,10 +88,85 @@ async def run(self, session: AsyncSession) -> None: self.approval_info.supplying_entity ) - # Add approving user - approving_user_url = ReviewingUserURL( - user_id=self.user_id, - url_id=self.approval_info.url_id + async def _optionally_update_record_type(self, url: URL) -> None: + update_if_not_none( + url, + "record_type", + self.approval_info.record_type.value + if self.approval_info.record_type is not None else None, + required=True ) - session.add(approving_user_url) \ No newline at end of file + async def _get_url(self, session: AsyncSession) -> URL: + query = ( + Select(URL) + .where(URL.id == self.approval_info.url_id) + .options( + joinedload(URL.optional_data_source_metadata), + joinedload(URL.confirmed_agencies), + ) + ) + url = await session.execute(query) + url = url.scalars().first() + return url + + async def _check_for_unspecified_agency_ids( + self, + existing_agency_ids: list[int], + new_agency_ids: list[int] + ) -> None: + """ + raises: + HTTPException: If no agency ids are specified and no existing agency ids are found + """ + if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail="Must specify agency_id if URL does not already have a confirmed agency" + ) + + async def _overwrite_existing_agencies(self, existing_agencies, new_agency_ids, session): + # Get any existing agency ids that are not in the new agency ids + # If new agency ids are specified, overwrite existing + if len(new_agency_ids) != 0: + for existing_agency in existing_agencies: + if existing_agency.id not in new_agency_ids: + # If the existing agency id is not in the new agency ids, delete it + await session.delete(existing_agency) + + async def _add_new_agencies(self, existing_agency_ids, new_agency_ids, session): + for new_agency_id in new_agency_ids: + if new_agency_id in 
existing_agency_ids: + continue + # Check if the new agency exists in the database + query = ( + select(Agency) + .where(Agency.agency_id == new_agency_id) + ) + existing_agency = await session.execute(query) + existing_agency = existing_agency.scalars().first() + if existing_agency is None: + # If not, create it + agency = Agency( + agency_id=new_agency_id, + name=PLACEHOLDER_AGENCY_NAME, + ) + session.add(agency) + + # If the new agency id is not in the existing agency ids, add it + confirmed_url_agency = LinkURLAgency( + url_id=self.approval_info.url_id, + agency_id=new_agency_id + ) + session.add(confirmed_url_agency) + + async def _add_validated_flag( + self, + session: AsyncSession, + url: URL + ) -> None: + flag = FlagURLValidated( + url_id=url.id, + type=ValidatedURLType.DATA_SOURCE + ) + session.add(flag) diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 7cb4670b..e7314edd 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -1,4 +1,4 @@ -from typing import Optional, Type +from typing import Type from sqlalchemy import FromClause, select, and_, Select, desc, asc, func from sqlalchemy.ext.asyncio import AsyncSession @@ -13,7 +13,8 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion @@ -93,7 +94,7 @@ def _build_base_query( query = ( query.where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, 
*where_exist_clauses ) ) @@ -189,7 +190,7 @@ async def get_count_ready_query(self): ) .where( LinkBatchURL.batch_id == self.batch_id, - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, *self._get_where_exist_clauses( builder.query ) @@ -203,22 +204,12 @@ async def get_count_reviewed_query(self): count_reviewed_query = ( select( Batch.id.label("batch_id"), - func.count(URL.id).label(self.count_label) + func.count(FlagURLValidated.url_id).label(self.count_label) ) .select_from(Batch) .join(LinkBatchURL) - .outerjoin(URL, URL.id == LinkBatchURL.url_id) - .where( - URL.status.in_( - [ - URLStatus.VALIDATED.value, - URLStatus.NOT_RELEVANT.value, - URLStatus.SUBMITTED.value, - URLStatus.INDIVIDUAL_RECORD.value - ] - ), - LinkBatchURL.batch_id == self.batch_id - ) + .outerjoin(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id) + .group_by(Batch.id) .subquery("count_reviewed") ) diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 7d603fe1..c9593a01 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,6 +5,8 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase @@ -33,19 +35,27 @@ async def run(self, session) -> None: url = await session.execute(query) url = url.scalars().first() + validation_type: ValidatedURLType | None = None match self.rejection_reason: case RejectionReason.INDIVIDUAL_RECORD: - url.status = URLStatus.INDIVIDUAL_RECORD.value + validation_type = ValidatedURLType.INDIVIDUAL_RECORD case RejectionReason.BROKEN_PAGE_404: url.status = 
class URLStatus(Enum):
    """Processing status of a collected URL.

    NOTE(review): after this change, validation outcomes (data source /
    not relevant / individual record) are tracked via FlagURLValidated
    rows rather than as URL statuses; only operational states remain here.
    """
    # URL is live in the pipeline (replaces the old pending/submitted/validated trio)
    OK = "ok"
    # Processing failed for this URL
    ERROR = "error"
    # URL duplicates another already-collected URL
    DUPLICATE = "duplicate"
    # URL resolved to a broken page
    NOT_FOUND = "404 not found"
src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.db.enums import TaskType from src.collectors.manager import AsyncCollectorManager diff --git a/src/core/enums.py b/src/core/enums.py index c6f90c80..edc18425 100644 --- a/src/core/enums.py +++ b/src/core/enums.py @@ -16,6 +16,7 @@ class RecordType(Enum): """ All available URL record types """ + # Police and Public ACCIDENT_REPORTS = "Accident Reports" ARREST_RECORDS = "Arrest Records" CALLS_FOR_SERVICE = "Calls for Service" @@ -31,16 +32,22 @@ class RecordType(Enum): SURVEYS = "Surveys" USE_OF_FORCE_REPORTS = "Use of Force Reports" VEHICLE_PURSUITS = "Vehicle Pursuits" + + # Info About Officers COMPLAINTS_AND_MISCONDUCT = "Complaints & Misconduct" DAILY_ACTIVITY_LOGS = "Daily Activity Logs" TRAINING_AND_HIRING_INFO = "Training & Hiring Info" PERSONNEL_RECORDS = "Personnel Records" + + # Info About Agencies ANNUAL_AND_MONTHLY_REPORTS = "Annual & Monthly Reports" BUDGETS_AND_FINANCES = "Budgets & Finances" CONTACT_INFO_AND_AGENCY_META = "Contact Info & Agency Meta" GEOGRAPHIC = "Geographic" LIST_OF_DATA_SOURCES = "List of Data Sources" POLICIES_AND_CONTRACTS = "Policies & Contracts" + + # Agency-Published Resources CRIME_MAPS_AND_REPORTS = "Crime Maps & Reports" CRIME_STATISTICS = "Crime Statistics" MEDIA_BULLETINS = "Media Bulletins" @@ -48,9 +55,13 @@ class RecordType(Enum): RESOURCES = "Resources" SEX_OFFENDER_REGISTRY = "Sex Offender Registry" WANTED_PERSONS = "Wanted Persons" + + # Jails and Courts Specific BOOKING_REPORTS = "Booking Reports" COURT_CASES = "Court Cases" INCARCERATION_RECORDS = "Incarceration Records" + + # Other OTHER = "Other" diff --git a/src/core/exceptions.py b/src/core/exceptions.py index d4f9c4a8..a361a24d 100644 --- a/src/core/exceptions.py +++ b/src/core/exceptions.py @@ -14,3 +14,4 @@ class MatchAgencyError(Exception): class FailedValidationException(HTTPException): def __init__(self, detail: str): super().__init__(status_code=HTTPStatus.BAD_REQUEST, 
async def inner_task_logic(self):
    """Push raw data sources to huggingface.

    Pages through the exportable data sources (1-indexed), pushing each
    non-empty page to the hub, then records the (naive) database time of
    this run as the new upload state.
    """
    run_dt = await self.adb_client.get_current_database_time()
    page = 1
    while True:
        batch = await self._get_data_sources_raw_for_huggingface(page=page)
        if not batch:
            # No more rows to export; stop paging.
            break
        self.hf_client.push_data_sources_raw_to_hub(batch, idx=page)
        page += 1
    # Upload-state column is timezone-naive, so strip tzinfo before storing.
    await self.adb_client.set_hugging_face_upload_state(run_dt.replace(tzinfo=None))
def convert_validated_type_to_relevant(
    validated_type: ValidatedURLType
) -> bool:
    """Translate a validation flag type into the boolean `relevant` label
    used by the HuggingFace export.

    Raises:
        ValueError: for any type other than DATA_SOURCE / NOT_RELEVANT.
    """
    if validated_type == ValidatedURLType.DATA_SOURCE:
        return True
    if validated_type == ValidatedURLType.NOT_RELEVANT:
        return False
    raise ValueError(f"Disallowed validated type: {validated_type}")
convert_validated_type_to_relevant from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.db.client.helpers import add_standard_limit_and_offset -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html -from src.db.helpers.session import session_helper as sh + class GetForLoadingToHuggingFaceQueryBuilder(QueryBuilderBase): @@ -22,29 +24,32 @@ def __init__(self, page: int): async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOutput]: label_url_id = 'url_id' label_url = 'url' - label_url_status = 'url_status' label_record_type_fine = 'record_type_fine' label_html = 'html' + label_type = 'type' query = ( select( URL.id.label(label_url_id), URL.url.label(label_url), - URL.status.label(label_url_status), URL.record_type.label(label_record_type_fine), - URLCompressedHTML.compressed_html.label(label_html) + URLCompressedHTML.compressed_html.label(label_html), + FlagURLValidated.type.label(label_type) ) .join( URLCompressedHTML, URL.id == URLCompressedHTML.url_id ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id + ) .where( - URL.status.in_([ - URLStatus.VALIDATED, - URLStatus.NOT_RELEVANT, - URLStatus.SUBMITTED - ]) + FlagURLValidated.type.in_( + (ValidatedURLType.DATA_SOURCE, + ValidatedURLType.NOT_RELEVANT) + ) ) ) query = add_standard_limit_and_offset(page=self.page, statement=query) @@ -57,7 +62,9 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut output = 
def convert_approval_status_to_validated_type(
    approval_status: ApprovalStatus
) -> ValidatedURLType:
    """Map a Data Sources approval status to its validation flag type.

    Only terminal statuses are convertible: APPROVED -> DATA_SOURCE,
    REJECTED -> NOT_RELEVANT.

    Raises:
        ValueError: for any other approval status.
    """
    if approval_status == ApprovalStatus.APPROVED:
        return ValidatedURLType.DATA_SOURCE
    if approval_status == ApprovalStatus.REJECTED:
        return ValidatedURLType.NOT_RELEVANT
    raise ValueError(f"Invalid approval status: {approval_status}")
self._update_existing_urls(lookups_existing_urls) await self._update_agency_link(lookups_existing_urls) - mappings_without_data_sources = get_mappings_for_urls_without_data_sources(lookup_results) - await self._add_new_data_sources(mappings_without_data_sources) + existing_url_mappings: list[URLMapping] = [ + convert_url_sync_info_to_url_mappings(lookup.url_info) + for lookup in lookups_existing_urls + ] - extant_urls = {lookup.url_info.url for lookup in lookups_existing_urls} - urls_to_add = list(self.urls - extant_urls) - if len(urls_to_add) == 0: - return - url_mappings = await self._add_new_urls(urls_to_add) - await self._add_new_data_sources(url_mappings) - await self._insert_agency_link(url_mappings) - - async def _lookup_urls(self): - lookup_results = await self.requester.lookup_urls(list(self.urls)) - return lookup_results + # Add new URLs and associated metadata + mappings_without_data_sources: list[URLMapping] = get_mappings_for_urls_without_data_sources(lookup_results) + await self._add_new_data_sources(mappings_without_data_sources) + extant_urls: set[str] = {lookup.url_info.url for lookup in lookups_existing_urls} + urls_to_add: list[str] = list(self.urls - extant_urls) + if len(urls_to_add) != 0: + new_url_mappings: list[URLMapping] = await self._add_new_urls(urls_to_add) + await self._add_new_data_sources(new_url_mappings) + await self._insert_agency_link(new_url_mappings) + else: + new_url_mappings: list[URLMapping] = [] + + # Upsert validated flags + all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings + mapper = URLMapper(all_url_mappings) + await self._upsert_validated_flags(mapper) + + async def _lookup_urls(self) -> list[LookupURLForDataSourcesSyncResponse]: + return await self.requester.lookup_urls(list(self.urls)) async def _insert_agency_link(self, url_mappings: list[URLMapping]): link_url_agency_insert_params = self.param_manager.insert_agency_link( @@ -81,16 +97,19 @@ async def _update_agency_link(self, 
lookups_existing_urls: list[LookupURLForData ) await self.requester.update_agency_links(link_url_agency_update_params) - async def _add_new_data_sources(self, url_mappings: list[URLMapping]): + async def _add_new_data_sources(self, url_mappings: list[URLMapping]) -> None: url_ds_insert_params = self.param_manager.add_new_data_sources(url_mappings) await self.requester.add_new_data_sources(url_ds_insert_params) - async def _add_new_urls(self, urls: list[str]): + async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: url_insert_params: list[InsertURLForDataSourcesSyncParams] = self.param_manager.add_new_urls(urls) url_mappings = await self.requester.add_new_urls(url_insert_params) return url_mappings - async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]) -> None: update_params = self.param_manager.update_existing_urls(lookups_existing_urls) await self.requester.update_existing_urls(update_params) + async def _upsert_validated_flags(self, url_mapper: URLMapper) -> None: + flags: list[FlagURLValidatedPydantic] = self.param_manager.upsert_validated_flags(url_mapper) + await self.requester.upsert_validated_flags(flags) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py index 3240e409..168f2511 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py @@ -23,13 +23,13 @@ def convert_to_source_collector_url_status( match ds_approval_status: case ApprovalStatus.APPROVED: - return URLStatus.SUBMITTED + return URLStatus.OK case ApprovalStatus.REJECTED: return URLStatus.NOT_RELEVANT case ApprovalStatus.NEEDS_IDENTIFICATION: - return 
URLStatus.PENDING + return URLStatus.OK case ApprovalStatus.PENDING: - return URLStatus.PENDING + return URLStatus.OK case _: raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py index 7ca8ebad..6493d3c8 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py @@ -1,5 +1,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ UpdateLinkURLAgencyForDataSourcesSyncParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import \ + convert_approval_status_to_validated_type from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ convert_to_url_insert_params from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper @@ -10,8 +12,14 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import ApprovalStatus +from src.util.url_mapper import URLMapper class UpsertURLsFromDataSourcesParamManager: @@ -98,4 +106,21 @@ def add_new_data_sources( 
def upsert_validated_flags(
    self,
    mapper: URLMapper
) -> list[FlagURLValidatedPydantic]:
    """Build one validation flag per URL known to *mapper*.

    *mapper* supplies the url -> url_id lookup; the per-URL sync info
    (including approval status) comes from this manager's own
    self._mapper. The approval status is converted to a ValidatedURLType
    to populate each flag.
    """
    def _flag_for(url: str) -> FlagURLValidatedPydantic:
        url_id: int = mapper.get_id(url)
        sync_info: DataSourcesSyncResponseInnerInfo = self._mapper.get(url)
        return FlagURLValidatedPydantic(
            url_id=url_id,
            type=convert_approval_status_to_validated_type(
                sync_info.approval_status
            ),
        )

    return [_flag_for(url) for url in mapper.get_all_urls()]
b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py index 5eeb4355..b3280cf2 100644 --- a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py @@ -4,7 +4,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -21,7 +21,7 @@ async def run(self, session: AsyncSession) -> list[AgencyIdentificationTDO]: Batch.strategy ) .select_from(URL) - .where(URL.status == URLStatus.PENDING.value) + .where(URL.status == URLStatus.OK.value) .outerjoin(LinkBatchURL) .outerjoin(Batch) ) diff --git a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py index e8a0e8ce..9877675b 100644 --- a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py @@ -17,7 +17,7 @@ async def run( select( URL.id ).where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ) ) diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index b3ba90ec..384cb5c4 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ 
b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[URLRelevantTDO]: .join(URLCompressedHTML) .outerjoin(AutoRelevantSuggestion) .where( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, AutoRelevantSuggestion.id.is_(None), ) ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 3b994f86..3af3c8db 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -26,8 +26,9 @@ from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO from src.api.endpoints.collector.manual.query import UploadManualBatchQueryBuilder +from src.api.endpoints.metrics.backlog.query import GetBacklogMetricsQueryBuilder from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO -from src.api.endpoints.metrics.batches.aggregated.query import GetBatchesAggregatedMetricsQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.core import GetBatchesAggregatedMetricsQueryBuilder from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO from src.api.endpoints.metrics.batches.breakdown.query import GetBatchesBreakdownMetricsQueryBuilder from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO @@ -36,6 +37,8 @@ GetMetricsURLsBreakdownPendingResponseInnerDTO from src.api.endpoints.metrics.dtos.get.urls.breakdown.submitted import GetMetricsURLsBreakdownSubmittedResponseDTO, \ GetMetricsURLsBreakdownSubmittedInnerDTO +from src.api.endpoints.metrics.urls.aggregated.query.core import GetURLsAggregatedMetricsQueryBuilder +from src.api.endpoints.metrics.urls.breakdown.query.core import GetURLsBreakdownPendingMetricsQueryBuilder from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from 
src.api.endpoints.review.approve.query_.core import ApproveURLQueryBuilder from src.api.endpoints.review.enums import RejectionReason @@ -52,9 +55,6 @@ from src.collectors.queries.insert.urls.query import InsertURLsQueryBuilder from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from src.core.env_var_manager import EnvVarManager -from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder -from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder -from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.core.tasks.scheduled.impl.huggingface.queries.state import SetHuggingFaceUploadStateQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder @@ -106,9 +106,10 @@ from src.db.helpers.session import session_helper as sh from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.backlog_snapshot import BacklogSnapshot -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.task_url import LinkTaskURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.log.pydantic.info import LogInfo @@ -546,7 +547,7 @@ async def get_urls_with_html_data_and_without_models( ): statement = (select(URL) .options(selectinload(URL.html_content)) - .where(URL.status == URLStatus.PENDING.value)) + .where(URL.status == URLStatus.OK.value)) statement = 
async def get_urls(
    self,
    page: int,
    errors: bool
) -> GetURLsResponseInfo:
    """Fetch a page of URLs by delegating to GetURLsQueryBuilder.

    `errors` is forwarded to the builder unchanged — presumably it toggles
    filtering to errored URLs; confirm against GetURLsQueryBuilder.
    """
    builder = GetURLsQueryBuilder(page=page, errors=errors)
    return await self.run_query_builder(builder)
async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> list[URLInfo]:
    """Retrieve all URLs associated with a batch (paged, defaulting to page 1)."""
    builder = GetURLsByBatchQueryBuilder(
        batch_id=batch_id,
        page=page,
    )
    return await self.run_query_builder(builder)
session: AsyncSession, url: str) -> SearchURLResponse: @@ -1114,183 +1128,16 @@ async def get_urls_breakdown_submitted_metrics( entries=final_results ) - @session_manager - async def get_urls_aggregated_metrics( - self, - session: AsyncSession - ) -> GetMetricsURLsAggregatedResponseDTO: - sc = StatementComposer - - oldest_pending_url_query = select( - URL.id, - URL.created_at - ).where( - URL.status == URLStatus.PENDING.value - ).order_by( - URL.created_at.asc() - ).limit(1) - - oldest_pending_url = await session.execute(oldest_pending_url_query) - oldest_pending_url = oldest_pending_url.one_or_none() - if oldest_pending_url is None: - oldest_pending_url_id = None - oldest_pending_created_at = None - else: - oldest_pending_url_id = oldest_pending_url.id - oldest_pending_created_at = oldest_pending_url.created_at - - def case_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) + async def get_urls_aggregated_metrics(self) -> GetMetricsURLsAggregatedResponseDTO: + return await self.run_query_builder(GetURLsAggregatedMetricsQueryBuilder()) - count_query = select( - sc.count_distinct(URL.id, label="count"), - case_column(URLStatus.PENDING, label="count_pending"), - case_column(URLStatus.SUBMITTED, label="count_submitted"), - case_column(URLStatus.VALIDATED, label="count_validated"), - case_column(URLStatus.NOT_RELEVANT, label="count_rejected"), - case_column(URLStatus.ERROR, label="count_error"), - ) - raw_results = await session.execute(count_query) - results = raw_results.all() + async def get_urls_breakdown_pending_metrics(self) -> GetMetricsURLsBreakdownPendingResponseDTO: + return await self.run_query_builder(GetURLsBreakdownPendingMetricsQueryBuilder()) - return GetMetricsURLsAggregatedResponseDTO( - count_urls_total=results[0].count, - count_urls_pending=results[0].count_pending, - count_urls_submitted=results[0].count_submitted, - 
count_urls_validated=results[0].count_validated, - count_urls_rejected=results[0].count_rejected, - count_urls_errors=results[0].count_error, - oldest_pending_url_id=oldest_pending_url_id, - oldest_pending_url_created_at=oldest_pending_created_at, - ) - - @session_manager - async def get_urls_breakdown_pending_metrics( - self, - session: AsyncSession - ) -> GetMetricsURLsBreakdownPendingResponseDTO: - sc = StatementComposer - - flags = ( - select( - URL.id.label("url_id"), - case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_record_type_annotation" - ), - case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_relevant_annotation" - ), - case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_agency_annotation" - ), - ) - .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) - .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id) - .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) - ).cte("flags") - - month = func.date_trunc('month', URL.created_at) - - # Build the query - query = ( - select( - month.label('month'), - func.count(URL.id).label('count_total'), - func.count( - case( - (flags.c.has_user_record_type_annotation == True, 1) - ) - ).label('user_record_type_count'), - func.count( - case( - (flags.c.has_user_relevant_annotation == True, 1) - ) - ).label('user_relevant_count'), - func.count( - case( - (flags.c.has_user_agency_annotation == True, 1) - ) - ).label('user_agency_count'), - ) - .outerjoin(flags, flags.c.url_id == URL.id) - .where(URL.status == URLStatus.PENDING.value) - .group_by(month) - .order_by(month.asc()) - ) - - # Execute the query and return the results - results = await session.execute(query) - all_results = results.all() - final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = [] - - for result in 
all_results: - dto = GetMetricsURLsBreakdownPendingResponseInnerDTO( - month=result.month.strftime("%B %Y"), - count_pending_total=result.count_total, - count_pending_relevant_user=result.user_relevant_count, - count_pending_record_type_user=result.user_record_type_count, - count_pending_agency_user=result.user_agency_count, - ) - final_results.append(dto) - return GetMetricsURLsBreakdownPendingResponseDTO( - entries=final_results, - ) - - @session_manager async def get_backlog_metrics( self, - session: AsyncSession ) -> GetMetricsBacklogResponseDTO: - month = func.date_trunc('month', BacklogSnapshot.created_at) - - # 1. Create a subquery that assigns row_number() partitioned by month - monthly_snapshot_subq = ( - select( - BacklogSnapshot.id, - BacklogSnapshot.created_at, - BacklogSnapshot.count_pending_total, - month.label("month_start"), - func.row_number() - .over( - partition_by=month, - order_by=BacklogSnapshot.created_at.desc() - ) - .label("row_number") - ) - .subquery() - ) - - # 2. 
Filter for the top (most recent) row in each month - stmt = ( - select( - monthly_snapshot_subq.c.month_start, - monthly_snapshot_subq.c.created_at, - monthly_snapshot_subq.c.count_pending_total - ) - .where(monthly_snapshot_subq.c.row_number == 1) - .order_by(monthly_snapshot_subq.c.month_start) - ) - - raw_result = await session.execute(stmt) - results = raw_result.all() - final_results = [] - for result in results: - final_results.append( - GetMetricsBacklogResponseInnerDTO( - month=result.month_start.strftime("%B %Y"), - count_pending_total=result.count_pending_total, - ) - ) - - return GetMetricsBacklogResponseDTO(entries=final_results) + return await self.run_query_builder(GetBacklogMetricsQueryBuilder()) @session_manager async def populate_backlog_snapshot( @@ -1300,10 +1147,15 @@ async def populate_backlog_snapshot( ): sc = StatementComposer # Get count of pending URLs - query = select( - sc.count_distinct(URL.id, label="count") - ).where( - URL.status == URLStatus.PENDING.value + query = ( + select( + sc.count_distinct(URL.id, label="count") + ) + .outerjoin(FlagURLValidated, URL.id == FlagURLValidated.url_id) + .where( + URL.status == URLStatus.OK.value, + FlagURLValidated.url_id.is_(None), + ) ) raw_result = await session.execute(query) @@ -1355,7 +1207,7 @@ async def has_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi URLProbedFor404 ).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, or_( URLProbedFor404.id == None, URLProbedFor404.last_probed_at < month_ago @@ -1378,7 +1230,7 @@ async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi URLProbedFor404 ).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, or_( URLProbedFor404.id == None, URLProbedFor404.last_probed_at < month_ago @@ -1463,21 +1315,11 @@ async def add_raw_html( ) session.add(compressed_html) - async def get_data_sources_raw_for_huggingface(self, page: int) -> 
list[GetForLoadingToHuggingFaceOutput]: - return await self.run_query_builder( - GetForLoadingToHuggingFaceQueryBuilder(page) - ) - async def set_hugging_face_upload_state(self, dt: datetime) -> None: await self.run_query_builder( SetHuggingFaceUploadStateQueryBuilder(dt=dt) ) - async def check_valid_urls_updated(self) -> bool: - return await self.run_query_builder( - CheckValidURLsUpdatedQueryBuilder() - ) - async def get_current_database_time(self) -> datetime: return await self.scalar(select(func.now())) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 03a45d3b..04ecc892 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -1,5 +1,5 @@ from functools import wraps -from typing import Optional, List +from typing import List from sqlalchemy import create_engine, update, Select from sqlalchemy.exc import IntegrityError @@ -7,12 +7,12 @@ from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.log.pydantic.info import LogInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.templates_.base import Base from src.db.models.impl.duplicate.sqlalchemy import Duplicate @@ -58,6 +58,11 @@ def wrapper(self, *args, **kwargs): return wrapper + @session_manager + def add_all(self, session: Session, objects: list[Base]): + session.add_all(objects) + session.commit() + @session_manager def insert_batch(self, session: Session, batch_info: BatchInfo) -> int: """Insert a new batch into the database and return its ID.""" @@ -221,14 +226,6 @@ def 
mark_urls_as_submitted( url_id = info.url_id data_source_id = info.data_source_id - query = ( - update(URL) - .where(URL.id == url_id) - .values( - status=URLStatus.SUBMITTED.value - ) - ) - url_data_source_object = URLDataSource( url_id=url_id, data_source_id=data_source_id @@ -237,7 +234,6 @@ def mark_urls_as_submitted( url_data_source_object.created_at = info.submitted_at session.add(url_data_source_object) - session.execute(query) if __name__ == "__main__": client = DatabaseClient() diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index a616664f..290ae2bd 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -51,21 +51,27 @@ async def has_results(session: AsyncSession, query: sa.Select) -> bool: async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], -): +) -> None: if len(models) == 0: return + # Parse models to get sa_model and id_field parser = BulkActionParser(models) + # Create base insert query query = pg_insert(parser.sa_model) - upsert_mappings = [upsert_model.model_dump() for upsert_model in models] + upsert_mappings: list[dict[str, Any]] = [ + upsert_model.model_dump() for upsert_model in models + ] + # Set all non-id fields to the values in the upsert mapping set_ = {} for k, v in upsert_mappings[0].items(): if k == parser.id_field: continue set_[k] = getattr(query.excluded, k) + # Add upsert logic to update on conflict query = query.on_conflict_do_update( index_elements=[parser.id_field], set_=set_ diff --git a/src/db/models/impl/batch/pydantic/__init__.py b/src/db/models/impl/batch/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/batch/pydantic.py b/src/db/models/impl/batch/pydantic/info.py similarity index 100% rename from src/db/models/impl/batch/pydantic.py rename to src/db/models/impl/batch/pydantic/info.py diff --git a/src/db/models/impl/batch/pydantic/insert.py 
b/src/db/models/impl/batch/pydantic/insert.py new file mode 100644 index 00000000..882ab371 --- /dev/null +++ b/src/db/models/impl/batch/pydantic/insert.py @@ -0,0 +1,17 @@ +from datetime import datetime + +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class BatchInsertModel(BulkInsertableModel): + strategy: str + status: BatchStatus + parameters: dict + user_id: int + date_generated: datetime + + @classmethod + def sa_model(cls) -> type[Batch]: + return Batch \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/__init__.py b/src/db/models/impl/flag/url_validated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/flag/url_validated/enums.py b/src/db/models/impl/flag/url_validated/enums.py new file mode 100644 index 00000000..a0228ee1 --- /dev/null +++ b/src/db/models/impl/flag/url_validated/enums.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class ValidatedURLType(Enum): + DATA_SOURCE = "data source" + META_URL = "meta url" + NOT_RELEVANT = "not relevant" + INDIVIDUAL_RECORD = "individual record" \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/pydantic.py b/src/db/models/impl/flag/url_validated/pydantic.py new file mode 100644 index 00000000..ccf3a110 --- /dev/null +++ b/src/db/models/impl/flag/url_validated/pydantic.py @@ -0,0 +1,22 @@ +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.templates.markers.bulk.insert import BulkInsertableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + +type_ = type + +class FlagURLValidatedPydantic( + BulkInsertableModel, + BulkUpsertableModel +): + + url_id: int + type: ValidatedURLType + + @classmethod + def sa_model(cls) -> type_[FlagURLValidated]: + return 
FlagURLValidated + + @classmethod + def id_field(cls) -> str: + return "url_id" \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py new file mode 100644 index 00000000..9d0528ab --- /dev/null +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -0,0 +1,25 @@ +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagURLValidated( + URLDependentMixin, + CreatedAtMixin, + UpdatedAtMixin, + Base, +): + __tablename__ = "flag_url_validated" + __table_args__ = ( + PrimaryKeyConstraint( + 'url_id', + ), + ) + + type = enum_column( + enum_type=ValidatedURLType, + name="validated_url_type", + ) diff --git a/src/db/models/impl/link/batch_url/__init__.py b/src/db/models/impl/link/batch_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/link/batch_url/pydantic.py b/src/db/models/impl/link/batch_url/pydantic.py new file mode 100644 index 00000000..143c57ce --- /dev/null +++ b/src/db/models/impl/link/batch_url/pydantic.py @@ -0,0 +1,11 @@ +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkBatchURLPydantic(BulkInsertableModel): + batch_id: int + url_id: int + + @classmethod + def sa_model(cls) -> type[LinkBatchURL]: + return LinkBatchURL \ No newline at end of file diff --git a/src/db/models/impl/link/batch_url.py b/src/db/models/impl/link/batch_url/sqlalchemy.py similarity index 100% rename from src/db/models/impl/link/batch_url.py rename to src/db/models/impl/link/batch_url/sqlalchemy.py diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py 
b/src/db/models/impl/link/url_agency/sqlalchemy.py index f8d72065..875fa25f 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -7,7 +7,7 @@ class LinkURLAgency(URLDependentMixin, WithIDBase): - __tablename__ = "link_urls_agencies" + __tablename__ = "link_urls_agency" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/impl/url/core/pydantic/info.py b/src/db/models/impl/url/core/pydantic/info.py index 07df21fe..0985b3fc 100644 --- a/src/db/models/impl/url/core/pydantic/info.py +++ b/src/db/models/impl/url/core/pydantic/info.py @@ -12,7 +12,7 @@ class URLInfo(BaseModel): batch_id: int | None= None url: str collector_metadata: dict | None = None - status: URLStatus = URLStatus.PENDING + status: URLStatus = URLStatus.OK updated_at: datetime.datetime | None = None created_at: datetime.datetime | None = None name: str | None = None diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index b893e9fa..18743f1b 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -16,6 +16,6 @@ def sa_model(cls) -> type[Base]: url: str collector_metadata: dict | None = None name: str | None = None - status: URLStatus = URLStatus.PENDING + status: URLStatus = URLStatus.OK record_type: RecordType | None = None source: URLSource \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/auto.py b/src/db/models/impl/url/suggestion/agency/auto.py index 6d6710c4..50fd5e03 100644 --- a/src/db/models/impl/url/suggestion/agency/auto.py +++ b/src/db/models/impl/url/suggestion/agency/auto.py @@ -8,7 +8,7 @@ class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): - __tablename__ = "automated_url_agency_suggestions" + __tablename__ = "url_auto_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) is_unknown = Column(Boolean, 
nullable=True) diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists.py index f8dfa654..c84f54f1 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists.py @@ -18,6 +18,7 @@ from src.collectors.enums import URLStatus from src.db.constants import ALL_ANNOTATION_MODELS +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase @@ -67,6 +68,13 @@ async def build(self) -> Any: *annotation_exists_cases_all ) anno_exists_query = await self._outer_join_models(anno_exists_query) - anno_exists_query = anno_exists_query.where(URL.status == URLStatus.PENDING.value) + anno_exists_query = anno_exists_query.outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + anno_exists_query = anno_exists_query.where( + URL.status == URLStatus.OK.value, + FlagURLValidated.url_id.is_(None) + ) anno_exists_query = anno_exists_query.group_by(URL.id).cte("annotations_exist") self.query = anno_exists_query diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index f9bb2ef8..86983b5c 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -9,6 +9,7 @@ from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.implementations.core.get.recent_batch_summaries.pending_url.cte import PENDING_URL_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import 
URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -24,9 +25,9 @@ def __init__( batch_id: int | None = None, ): super().__init__() + self.has_pending_urls = has_pending_urls self.url_counts_cte = URLCountsCTEQueryBuilder( page=page, - has_pending_urls=has_pending_urls, collector_type=collector_type, status=status, batch_id=batch_id, @@ -49,6 +50,14 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: builder.query, builder.get(count_labels.batch_id) == Batch.id, ) + if self.has_pending_urls is not None: + query = query.join( + PENDING_URL_CTE, + PENDING_URL_CTE.c.batch_id == Batch.id, + ).where( + PENDING_URL_CTE.c.has_pending_urls == self.has_pending_urls + ) + raw_results = await session.execute(query) summaries: list[BatchSummary] = [] diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py new file mode 100644 index 00000000..a0722229 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py @@ -0,0 +1,30 @@ +from sqlalchemy import select, func, case, and_ + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +PENDING_URL_CTE = ( + select( + Batch.id.label("batch_id"), + case( + ( + and_( + func.count(LinkBatchURL.url_id) > func.count(FlagURLValidated.url_id), + ) + , True), + else_=False + ).label("has_pending_urls") + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .outerjoin( + 
FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id, + ) + .group_by( + Batch.id + ).cte("has_pending_urls") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 72a33336..afbd4477 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -1,15 +1,22 @@ -from typing import Optional - from sqlalchemy import Select, case, Label, and_, exists -from sqlalchemy.sql.functions import count, coalesce +from sqlalchemy.sql.functions import count, coalesce, func from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.all import ALL_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.duplicate import DUPLICATE_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.error import ERROR_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.not_relevant import NOT_RELEVANT_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.pending import 
PENDING_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.submitted import SUBMITTED_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -18,14 +25,12 @@ class URLCountsCTEQueryBuilder(QueryBuilderBase): def __init__( self, page: int = 1, - has_pending_urls: bool | None = None, collector_type: CollectorType | None = None, status: BatchStatus | None = None, batch_id: int | None = None ): super().__init__(URLCountsLabels()) self.page = page - self.has_pending_urls = has_pending_urls self.collector_type = collector_type self.status = status self.batch_id = batch_id @@ -33,31 +38,31 @@ def __init__( def get_core_query(self): labels: URLCountsLabels = self.labels - return ( + query = ( Select( Batch.id.label(labels.batch_id), - coalesce(count(URL.id), 0).label(labels.total), - self.count_case_url_status(URLStatus.PENDING, labels.pending), - self.count_case_url_status(URLStatus.SUBMITTED, labels.submitted), - self.count_case_url_status(URLStatus.NOT_RELEVANT, labels.not_relevant), - self.count_case_url_status(URLStatus.ERROR, labels.error), - self.count_case_url_status(URLStatus.DUPLICATE, labels.duplicate), + func.coalesce(DUPLICATE_CTE.count, 0).label(labels.duplicate), + func.coalesce(SUBMITTED_CTE.count, 0).label(labels.submitted), + func.coalesce(PENDING_CTE.count, 0).label(labels.pending), + func.coalesce(ALL_CTE.count, 0).label(labels.total), + func.coalesce(NOT_RELEVANT_CTE.count, 0).label(labels.not_relevant), + func.coalesce(ERROR_CTE.count, 0).label(labels.error), ) .select_from(Batch) - .outerjoin(LinkBatchURL) - .outerjoin( - URL - ) ) + for cte in [DUPLICATE_CTE, SUBMITTED_CTE, PENDING_CTE, ALL_CTE, NOT_RELEVANT_CTE, ERROR_CTE]: + query = query.outerjoin( + cte.cte, + Batch.id == cte.batch_id + ) + return query def build(self): query = self.get_core_query() - query = self.apply_pending_urls_filter(query) query = self.apply_collector_type_filter(query) query 
= self.apply_status_filter(query) query = self.apply_batch_id_filter(query) - query = query.group_by(Batch.id) query = add_page_offset(query, page=self.page) query = query.order_by(Batch.id) self.query = query.cte("url_counts") @@ -67,23 +72,6 @@ def apply_batch_id_filter(self, query: Select): return query return query.where(Batch.id == self.batch_id) - def apply_pending_urls_filter(self, query: Select): - if self.has_pending_urls is None: - return query - pending_url_subquery = ( - exists( - Select(URL).join(LinkBatchURL).where( - and_( - LinkBatchURL.batch_id == Batch.id, - URL.status == URLStatus.PENDING.value - ) - ) - ) - ).correlate(Batch) - if self.has_pending_urls: - return query.where(pending_url_subquery) - return query.where(~pending_url_subquery) - def apply_collector_type_filter(self, query: Select): if self.collector_type is None: return query @@ -93,18 +81,3 @@ def apply_status_filter(self, query: Select): if self.status is None: return query return query.where(Batch.status == self.status.value) - - @staticmethod - def count_case_url_status( - url_status: URLStatus, - label: str - ) -> Label: - return ( - coalesce( - count( - case( - (URL.status == url_status.value, 1) - ) - ) - , 0).label(label) - ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py new file mode 100644 index 00000000..5cab51cf --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py @@ -0,0 +1,20 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from 
src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +ALL_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(LinkBatchURL.url_id).label("total_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .group_by( + Batch.id + ).cte("total_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py new file mode 100644 index 00000000..906dd49c --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py @@ -0,0 +1,29 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +DUPLICATE_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("duplicate_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .where( + URL.status == URLStatus.DUPLICATE + ) + .group_by( + Batch.id + ).cte("duplicate_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py new file mode 100644 index 00000000..b74020c4 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -0,0 +1,29 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy 
import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +ERROR_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("error_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .where( + URL.status == URLStatus.ERROR + ) + .group_by( + Batch.id + ).cte("error_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py new file mode 100644 index 00000000..cbb55369 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py @@ -0,0 +1,34 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +NOT_RELEVANT_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("not_relevant_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT + ) + .group_by( + Batch.id + ).cte("not_relevant_count") +) \ No newline at end of file diff --git 
a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py new file mode 100644 index 00000000..b7e4594c --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py @@ -0,0 +1,33 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +PENDING_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("pending_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type.is_(None) + ) + .group_by( + Batch.id + ).cte("pending_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py new file mode 100644 index 00000000..5ab305cc --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py @@ -0,0 +1,32 @@ + + +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + 
URLCountsCTEContainer + +SUBMITTED_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("submitted_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .join( + URLDataSource, + URLDataSource.url_id == URL.id, + ) + .group_by( + Batch.id + ).cte("submitted_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py new file mode 100644 index 00000000..7f769c76 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py @@ -0,0 +1,18 @@ +from sqlalchemy import CTE, Column + + +class URLCountsCTEContainer: + + def __init__( + self, + cte: CTE + ): + self.cte = cte + + @property + def batch_id(self) -> Column: + return self.cte.columns[0] + + @property + def count(self) -> Column: + return self.cte.columns[1] diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 269dfced..37b3a560 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -44,7 +44,7 @@ async def build(self) -> Any: URL.id == self.url_id ) .where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ).cte("pending") ) diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 45a281de..ec8e09bd 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -8,11 +8,10 @@ from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import 
LinkBatchURL from src.db.models.impl.link.task_url import LinkTaskURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.task.core import Task -from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch @@ -96,7 +95,7 @@ def exclude_urls_with_agency_suggestions( def pending_urls_missing_miscellaneous_metadata_query() -> Select: query = select(URL).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, URL.name == None, URL.description == None, URLOptionalDataSourceMetadata.url_id == None diff --git a/src/db/templates/requester.py b/src/db/templates/requester.py new file mode 100644 index 00000000..d974245e --- /dev/null +++ b/src/db/templates/requester.py @@ -0,0 +1,15 @@ +""" +A requester is a class that contains a session and provides methods for +performing database operations. +""" +from abc import ABC + +from sqlalchemy.ext.asyncio import AsyncSession + +import src.db.helpers.session.session_helper as sh + +class RequesterBase(ABC): + + def __init__(self, session: AsyncSession): + self.session = session + self.session_helper = sh \ No newline at end of file diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 47a24cac..b8227c7c 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -8,6 +8,7 @@ def switch_enum_type( new_enum_values, drop_old_enum=True, check_constraints_to_drop: list[str] = None, + conversion_mappings: dict[str, str] = None ): """ Switches an ENUM type in a PostgreSQL column by: @@ -21,6 +22,8 @@ def switch_enum_type( :param enum_name: Name of the ENUM type in PostgreSQL. :param new_enum_values: List of new ENUM values. :param drop_old_enum: Whether to drop the old ENUM type. 
+ :param check_constraints_to_drop: List of check constraints to drop before switching the ENUM type. + :param conversion_mappings: Dictionary of old values to new values for the ENUM type. """ # 1. Drop check constraints that reference the enum @@ -38,7 +41,21 @@ def switch_enum_type( new_enum_type.create(op.get_bind()) # Alter the column type to use the new enum type - op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + if conversion_mappings is None: + op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + if conversion_mappings is not None: + case_when: str = "" + for old_value, new_value in conversion_mappings.items(): + case_when += f"WHEN '{old_value}' THEN '{new_value}'\n" + + op.execute(f""" + ALTER TABLE "{table_name}" + ALTER COLUMN "{column_name}" TYPE "{enum_name}" + USING CASE {column_name}::text + {case_when} + ELSE "{column_name}"::text + END::{enum_name}; + """) # Drop the old enum type if drop_old_enum: diff --git a/tests/automated/integration/api/batch/__init__.py b/tests/automated/integration/api/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/summaries/__init__.py b/tests/automated/integration/api/batch/summaries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py new file mode 100644 index 00000000..d91e1a8c --- /dev/null +++ b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -0,0 +1,95 @@ +import pytest + +from src.core.enums import BatchStatus +from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum +from 
tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters + + +@pytest.mark.asyncio +async def test_get_batch_summaries(api_test_helper): + ath = api_test_helper + + batch_params = [ + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=1, + status=URLCreationEnum.OK + ), + TestURLCreationParameters( + count=2, + status=URLCreationEnum.SUBMITTED + ) + ] + ), + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=4, + status=URLCreationEnum.NOT_RELEVANT + ), + TestURLCreationParameters( + count=3, + status=URLCreationEnum.ERROR + ) + ] + ), + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=7, + status=URLCreationEnum.DUPLICATE + ), + TestURLCreationParameters( + count=1, + status=URLCreationEnum.SUBMITTED + ) + ] + ) + ] + + batch_1_creation_info = await ath.db_data_creator.batch_v2(batch_params[0]) + batch_2_creation_info = await ath.db_data_creator.batch_v2(batch_params[1]) + batch_3_creation_info = await ath.db_data_creator.batch_v2(batch_params[2]) + + batch_1_id = batch_1_creation_info.batch_id + batch_2_id = batch_2_creation_info.batch_id + batch_3_id = batch_3_creation_info.batch_id + + + response = ath.request_validator.get_batch_statuses() + results = response.results + + assert len(results) == 3 + + result_1 = results[0] + assert result_1.id == batch_1_id + assert result_1.status == BatchStatus.READY_TO_LABEL + counts_1 = result_1.url_counts + assert counts_1.total == 3 + assert counts_1.pending == 1 + assert counts_1.submitted == 2 + assert counts_1.not_relevant == 0 + assert counts_1.duplicate == 0 + assert counts_1.errored == 0 + + result_2 = results[1] + assert result_2.id == batch_2_id + counts_2 = result_2.url_counts + assert counts_2.total == 7 + assert counts_2.not_relevant == 4 + assert counts_2.errored == 3 + assert counts_2.pending == 3 + assert counts_2.submitted == 0 + assert counts_2.duplicate == 0 + + result_3 = results[2] + assert 
result_3.id == batch_3_id + counts_3 = result_3.url_counts + assert counts_3.total == 8 + assert counts_3.not_relevant == 0 + assert counts_3.errored == 0 + assert counts_3.pending == 7 + assert counts_3.submitted == 1 + assert counts_3.duplicate == 7 diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py new file mode 100644 index 00000000..e8d584e7 --- /dev/null +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -0,0 +1,72 @@ +import pytest + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_get_batch_summaries_pending_url_filter(api_test_helper): + ath = api_test_helper + dbdc: DBDataCreator = ath.db_data_creator + + # Add an errored out batch + batch_error: int = await dbdc.create_batch(status=BatchStatus.ERROR) + + # Add a batch with pending urls + batch_pending = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=2, + batch_status=BatchStatus.READY_TO_LABEL, + with_html_content=True, + url_status=URLCreationEnum.OK + ) + + # Add a batch with submitted URLs + batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) + submitted_url_ids: list[int] = await dbdc.create_submitted_urls(count=2) + await dbdc.create_batch_url_links( + batch_id=batch_submitted, + url_ids=submitted_url_ids + ) + + # Add an aborted batch + batch_aborted: int = await dbdc.create_batch(status=BatchStatus.ABORTED) + + # Add a batch with validated URLs + batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) + validated_url_ids: list[int] = await dbdc.create_validated_urls( + count=2 + ) + await dbdc.create_batch_url_links( + batch_id=batch_validated, + 
url_ids=validated_url_ids + ) + + # Test filter for pending URLs and only retrieve the second batch + pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=True + ) + + assert len(pending_urls_results.results) == 1 + assert pending_urls_results.results[0].id == batch_pending.batch_id + + # Test filter without pending URLs and retrieve the other four batches + no_pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=False + ) + + assert len(no_pending_urls_results.results) == 4 + for result in no_pending_urls_results.results: + assert result.id in [ + batch_error, + batch_submitted, + batch_validated, + batch_aborted + ] + + # Test no filter for pending URLs and retrieve all batches + no_filter_results = ath.request_validator.get_batch_statuses() + + assert len(no_filter_results.results) == 5 diff --git a/tests/automated/integration/api/batch/test_batch.py b/tests/automated/integration/api/batch/test_batch.py new file mode 100644 index 00000000..86f35cfc --- /dev/null +++ b/tests/automated/integration/api/batch/test_batch.py @@ -0,0 +1,64 @@ +from src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.dtos.url.insert import InsertURLsInfo +from src.collectors.impl.example.dtos.input import ExampleInputDTO +from src.core.enums import BatchStatus + + +def test_abort_batch(api_test_helper): + ath = api_test_helper + + dto = ExampleInputDTO( + sleep_time=1 + ) + + batch_id = ath.request_validator.example_collector(dto=dto)["batch_id"] + + response = ath.request_validator.abort_batch(batch_id=batch_id) + + assert response.message == "Batch aborted." 
+ + bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) + + assert bi.status == BatchStatus.ABORTED + +def test_get_batch_urls(api_test_helper): + + # Insert batch and urls into database + ath = api_test_helper + batch_id = ath.db_data_creator.batch() + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) + + response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=1) + assert len(response.urls) == 100 + # Check that the first url corresponds to the first url inserted + assert response.urls[0].url == iui.url_mappings[0].url + # Check that the last url corresponds to the 100th url inserted + assert response.urls[-1].url == iui.url_mappings[99].url + + + # Check that a more limited set of urls exist + response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=2) + assert len(response.urls) == 1 + # Check that this url corresponds to the last url inserted + assert response.urls[0].url == iui.url_mappings[-1].url + +def test_get_duplicate_urls(api_test_helper): + + # Insert batch and url into database + ath = api_test_helper + batch_id = ath.db_data_creator.batch() + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) + # Get a list of all url ids + url_ids = [url.url_id for url in iui.url_mappings] + + # Create a second batch which will be associated with the duplicates + dup_batch_id = ath.db_data_creator.batch() + + # Insert duplicate urls into database + ath.db_data_creator.duplicate_urls(duplicate_batch_id=dup_batch_id, url_ids=url_ids) + + response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=1) + assert len(response.duplicates) == 100 + + response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=2) + assert len(response.duplicates) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/example_collector/test_happy_path.py 
b/tests/automated/integration/api/example_collector/test_happy_path.py index bbb52789..d580f546 100644 --- a/tests/automated/integration/api/example_collector/test_happy_path.py +++ b/tests/automated/integration/api/example_collector/test_happy_path.py @@ -6,7 +6,7 @@ from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 084762b9..3121dd4e 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -2,44 +2,63 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ + create_batch_url_links, create_validated_flags +from tests.helpers.setup.wipe import wipe_database @pytest.mark.asyncio -async def test_get_batches_aggregated_metrics(api_test_helper): +async def test_get_batches_aggregated_metrics( + api_test_helper, + wiped_database +): ath = api_test_helper + 
adb_client: AsyncDatabaseClient = ath.adb_client() # Create successful batches with URLs of different statuses - all_params = [] for i in range(3): - params = TestBatchCreationParameters( + batch_id = await create_batch( + adb_client=adb_client, strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - TestURLCreationParameters( - count=3, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=4, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ) - ] ) - all_params.append(params) - + url_ids_error: list[int] = await create_urls( + adb_client=adb_client, + status=URLStatus.ERROR, + count=4, + ) + url_ids_ok: list[int] = await create_urls( + adb_client=adb_client, + status=URLStatus.OK, + count=11, + ) + url_ids_all: list[int] = url_ids_error + url_ids_ok + await create_batch_url_links( + adb_client=adb_client, + batch_id=batch_id, + url_ids=url_ids_all, + ) + urls_submitted: list[int] = url_ids_all[:2] + urls_not_relevant: list[int] = url_ids_all[2:5] + urls_validated: list[int] = url_ids_all[5:10] + await create_validated_flags( + adb_client=adb_client, + url_ids=urls_validated + urls_submitted, + validation_type=ValidatedURLType.DATA_SOURCE, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=urls_not_relevant, + validation_type=ValidatedURLType.NOT_RELEVANT, + ) + await create_url_data_sources( + adb_client=adb_client, + url_ids=urls_submitted, + ) + all_params = [] # Create failed batches for i in range(2): params = TestBatchCreationParameters( @@ -66,8 +85,8 @@ async def test_get_batches_aggregated_metrics(api_test_helper): assert inner_dto_manual.count_urls == 45 assert inner_dto_manual.count_successful_batches == 3 assert inner_dto_manual.count_failed_batches == 0 - assert inner_dto_manual.count_urls_pending == 3 + assert 
inner_dto_manual.count_urls_pending == 15 assert inner_dto_manual.count_urls_submitted == 6 assert inner_dto_manual.count_urls_rejected == 9 assert inner_dto_manual.count_urls_errors == 12 - assert inner_dto_manual.count_urls_validated == 15 + assert inner_dto_manual.count_urls_validated == 30 diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index 0cce8740..a75979ea 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -1,79 +1,98 @@ +from datetime import datetime, timedelta + import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ + create_url_data_sources @pytest.mark.asyncio async def test_get_batches_breakdown_metrics(api_test_helper): # Create a different batch for each month, with different URLs - today = pendulum.parse('2021-01-01') + today = datetime.now() ath = api_test_helper + adb_client: AsyncDatabaseClient = ath.adb_client() - batch_1_params = TestBatchCreationParameters( + batch_id_1 = await create_batch( + adb_client=adb_client, strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) - batch_2_params = TestBatchCreationParameters( - 
strategy=CollectorType.EXAMPLE, - outcome=BatchStatus.ERROR, - created_at=today.subtract(weeks=1), + url_ids_1: list[int] = await create_urls( + adb_client=adb_client, + count=3, + ) + await create_batch_url_links(adb_client=adb_client, batch_id=batch_id_1, url_ids=url_ids_1) + await create_validated_flags( + adb_client=adb_client, + url_ids=url_ids_1[:2], + validation_type=ValidatedURLType.DATA_SOURCE + ) + await create_url_data_sources( + adb_client=adb_client, + url_ids=url_ids_1[:2], ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) - batch_3_params = TestBatchCreationParameters( + + batch_id_2 = await create_batch( + adb_client=adb_client, + status=BatchStatus.ERROR, + date_generated=today - timedelta(days=7), + ) + + batch_id_3 = await create_batch( + adb_client=adb_client, strategy=CollectorType.AUTO_GOOGLER, - created_at=today.subtract(weeks=2), - urls=[ - TestURLCreationParameters( - count=3, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=4, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ), - ] + date_generated=today - timedelta(days=14) ) - batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + error_url_ids: list[int] = await create_urls( + adb_client=adb_client, + status=URLStatus.ERROR, + count=4, + ) + validated_url_ids: list[int] = await create_urls( + adb_client=adb_client, + count=8, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=validated_url_ids[:3], + validation_type=ValidatedURLType.NOT_RELEVANT, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=validated_url_ids[4:9], + validation_type=ValidatedURLType.DATA_SOURCE, + ) + await create_batch_url_links( + adb_client=adb_client, + batch_id=batch_id_3, + url_ids=error_url_ids + validated_url_ids, + ) + dto_1 = await ath.request_validator.get_batches_breakdown_metrics( page=1 ) assert len(dto_1.batches) == 3 dto_batch_1 = dto_1.batches[2] - assert 
dto_batch_1.batch_id == batch_1.batch_id + assert dto_batch_1.batch_id == batch_id_1 assert dto_batch_1.strategy == CollectorType.MANUAL assert dto_batch_1.status == BatchStatus.READY_TO_LABEL - assert pendulum.instance(dto_batch_1.created_at) > today assert dto_batch_1.count_url_total == 3 assert dto_batch_1.count_url_pending == 1 assert dto_batch_1.count_url_submitted == 2 assert dto_batch_1.count_url_rejected == 0 assert dto_batch_1.count_url_error == 0 - assert dto_batch_1.count_url_validated == 0 + assert dto_batch_1.count_url_validated == 2 dto_batch_2 = dto_1.batches[1] - assert dto_batch_2.batch_id == batch_2.batch_id + assert dto_batch_2.batch_id == batch_id_2 assert dto_batch_2.status == BatchStatus.ERROR assert dto_batch_2.strategy == CollectorType.EXAMPLE - assert pendulum.instance(dto_batch_2.created_at) == today.subtract(weeks=1) assert dto_batch_2.count_url_total == 0 assert dto_batch_2.count_url_submitted == 0 assert dto_batch_2.count_url_pending == 0 @@ -82,16 +101,15 @@ async def test_get_batches_breakdown_metrics(api_test_helper): assert dto_batch_2.count_url_validated == 0 dto_batch_3 = dto_1.batches[0] - assert dto_batch_3.batch_id == batch_3.batch_id + assert dto_batch_3.batch_id == batch_id_3 assert dto_batch_3.status == BatchStatus.READY_TO_LABEL assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER - assert pendulum.instance(dto_batch_3.created_at) == today.subtract(weeks=2) assert dto_batch_3.count_url_total == 12 - assert dto_batch_3.count_url_pending == 0 + assert dto_batch_3.count_url_pending == 5 assert dto_batch_3.count_url_submitted == 0 assert dto_batch_3.count_url_rejected == 3 assert dto_batch_3.count_url_error == 4 - assert dto_batch_3.count_url_validated == 5 + assert dto_batch_3.count_url_validated == 7 dto_2 = await ath.request_validator.get_batches_breakdown_metrics( page=2 diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index 
a6807a23..d39d0640 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -3,9 +3,12 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import SuggestedStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio @@ -14,29 +17,21 @@ async def test_get_backlog_metrics(api_test_helper): ath = api_test_helper adb_client = ath.adb_client() + ddc: DBDataCreator = ath.db_data_creator # Populate the backlog table and test that backlog metrics returned on a monthly basis # Ensure that multiple days in each month are added to the backlog table, with different values - - batch_1_params = TestBatchCreationParameters( - strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] + batch_1_id: int = await ddc.create_batch() + url_ids_1: list[int] = await ddc.create_urls(count=3) + await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) + submitted_url_ids_1: list[int] = url_ids_1[:2] + await ddc.create_validated_flags( + url_ids=submitted_url_ids_1, + validation_type=ValidatedURLType.DATA_SOURCE ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + await ddc.create_url_data_sources(url_ids=submitted_url_ids_1) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=3).naive() @@ 
-46,23 +41,18 @@ async def test_get_backlog_metrics(api_test_helper): dt=today.subtract(months=2, days=3).naive() ) - batch_2_params = TestBatchCreationParameters( - strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=2, - status=URLStatus.ERROR - ), - ] + batch_2_id: int = await ddc.create_batch() + not_relevant_url_ids_2: list[int] = await ddc.create_urls(count=6) + await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) + await ddc.create_validated_flags( + url_ids=not_relevant_url_ids_2[:4], + validation_type=ValidatedURLType.NOT_RELEVANT + ) + error_url_ids_2: list[int] = await ddc.create_urls( + status=URLStatus.ERROR, + count=2 ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=2).naive() @@ -72,23 +62,14 @@ async def test_get_backlog_metrics(api_test_helper): dt=today.subtract(months=1, days=4).naive() ) - batch_3_params = TestBatchCreationParameters( - strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=7, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ), - ] + batch_3_id: int = await ddc.create_batch() + url_ids_3: list[int] = await ddc.create_urls(count=12) + await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) + await ddc.create_validated_flags( + url_ids=url_ids_3[:5], + validation_type=ValidatedURLType.DATA_SOURCE ) - batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + await adb_client.populate_backlog_snapshot( dt=today.subtract(months=1).naive() @@ -100,5 +81,5 @@ async def 
test_get_backlog_metrics(api_test_helper): # Test that the count closest to the beginning of the month is returned for each month assert dto.entries[0].count_pending_total == 1 - assert dto.entries[1].count_pending_total == 5 - assert dto.entries[2].count_pending_total == 12 + assert dto.entries[1].count_pending_total == 3 + assert dto.entries[2].count_pending_total == 10 diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index c8957952..49f63cf4 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -1,75 +1,66 @@ +from datetime import datetime, timedelta, timezone + import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_get_urls_aggregated_metrics(api_test_helper): ath = api_test_helper - today = pendulum.parse('2021-01-01') + today = datetime.now() + + ddc: DBDataCreator = ath.db_data_creator batch_0_params = TestBatchCreationParameters( strategy=CollectorType.MANUAL, - created_at=today.subtract(days=1), + created_at=today - timedelta(days=1), urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, ), ] ) - batch_0 = await ath.db_data_creator.batch_v2(batch_0_params) - oldest_url_id = batch_0.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id - + batch_0: int = await ddc.create_batch( + strategy=CollectorType.MANUAL, + date_generated=today - 
timedelta(days=1) + ) + url_ids_0: list[int] = await ddc.create_urls(batch_id=batch_0) + oldest_url_id: int = url_ids_0[0] - batch_1_params = TestBatchCreationParameters( + batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + url_ids_1_ok: list[int] = await ddc.create_urls(batch_id=batch_1, count=1) + url_ids_1_submitted: list[int] = await ddc.create_submitted_urls(count=2) + await ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) - batch_2_params = TestBatchCreationParameters( + batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.PENDING, - ), - TestURLCreationParameters( - count=2, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=1, - status=URLStatus.VALIDATED - ), - TestURLCreationParameters( - count=5, - status=URLStatus.NOT_RELEVANT - ), - ] ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + url_ids_2_ok: list[int] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_ids_2_error: list[int] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) + url_ids_2_validated: list[int] = await ddc.create_validated_urls(count=1, validation_type=ValidatedURLType.DATA_SOURCE) + url_ids_2_not_relevant: list[int] = await ddc.create_validated_urls(count=5, validation_type=ValidatedURLType.NOT_RELEVANT) + await ddc.create_batch_url_links( + url_ids=url_ids_2_validated + url_ids_2_not_relevant, + batch_id=batch_2 + ) + + dto = await ath.request_validator.get_urls_aggregated_metrics() assert dto.oldest_pending_url_id == oldest_url_id - assert dto.oldest_pending_url_created_at == today.subtract(days=1).in_timezone('UTC').naive() - assert 
dto.count_urls_pending == 6 assert dto.count_urls_rejected == 5 assert dto.count_urls_errors == 2 - assert dto.count_urls_validated == 1 + assert dto.count_urls_validated == 8 assert dto.count_urls_submitted == 2 assert dto.count_urls_total == 16 diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py index e81d6ec7..02f1aae2 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py @@ -6,6 +6,7 @@ from src.core.enums import SuggestedStatus, RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @@ -27,14 +28,14 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.NOT_RELEVANT ) ), TestURLCreationParameters( count=2, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), ] ) @@ -44,7 +45,7 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.CALLS_FOR_SERVICE @@ -60,15 +61,15 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), TestURLCreationParameters( count=4, - status=URLStatus.ERROR + status=URLCreationEnum.ERROR ), 
TestURLCreationParameters( count=5, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.INCARCERATION_RECORDS, diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index 71e00e51..cbd30f8b 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType, URLStatus from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @@ -18,11 +19,11 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING + status=URLCreationEnum.OK ), TestURLCreationParameters( count=2, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), ] ) @@ -32,7 +33,7 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ) ], created_at=today.subtract(weeks=1), @@ -44,15 +45,15 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), TestURLCreationParameters( count=4, - status=URLStatus.ERROR + status=URLCreationEnum.ERROR ), TestURLCreationParameters( count=5, - status=URLStatus.VALIDATED + status=URLCreationEnum.VALIDATED ), ] ) diff --git a/tests/automated/integration/api/review/conftest.py b/tests/automated/integration/api/review/conftest.py index e4345821..59d76930 100644 --- 
a/tests/automated/integration/api/review/conftest.py +++ b/tests/automated/integration/api/review/conftest.py @@ -5,32 +5,18 @@ from src.core.enums import SuggestedStatus, RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @pytest_asyncio.fixture async def batch_url_creation_info(db_data_creator): - simple_parameter_statuses = [ - URLStatus.VALIDATED, - URLStatus.SUBMITTED, - URLStatus.INDIVIDUAL_RECORD, - URLStatus.NOT_RELEVANT, - URLStatus.ERROR, - URLStatus.DUPLICATE, - URLStatus.NOT_FOUND - ] - simple_parameters = [ - TestURLCreationParameters( - status=status - ) for status in simple_parameter_statuses - ] parameters = TestBatchCreationParameters( urls=[ - *simple_parameters, TestURLCreationParameters( count=2, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.ARREST_RECORDS, diff --git a/tests/automated/integration/api/review/rejection/test_individual_record.py b/tests/automated/integration/api/review/rejection/test_individual_record.py index 6e81d378..ec96819a 100644 --- a/tests/automated/integration/api/review/rejection/test_individual_record.py +++ b/tests/automated/integration/api/review/rejection/test_individual_record.py @@ -2,14 +2,21 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.api.review.rejection.helpers import run_rejection_test +from tests.helpers.api_test_helper import APITestHelper 
@pytest.mark.asyncio -async def test_rejection_individual_record(api_test_helper): +async def test_rejection_individual_record(api_test_helper: APITestHelper): await run_rejection_test( api_test_helper, rejection_reason=RejectionReason.INDIVIDUAL_RECORD, - url_status=URLStatus.INDIVIDUAL_RECORD + url_status=URLStatus.OK ) + # Get FlagURLValidated and confirm Individual Record + flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0] + assert flag.type == ValidatedURLType.INDIVIDUAL_RECORD + diff --git a/tests/automated/integration/api/review/rejection/test_not_relevant.py b/tests/automated/integration/api/review/rejection/test_not_relevant.py index 1ad2847f..7b6154e1 100644 --- a/tests/automated/integration/api/review/rejection/test_not_relevant.py +++ b/tests/automated/integration/api/review/rejection/test_not_relevant.py @@ -2,6 +2,8 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.api.review.rejection.helpers import run_rejection_test @@ -10,5 +12,9 @@ async def test_rejection_not_relevant(api_test_helper): await run_rejection_test( api_test_helper, rejection_reason=RejectionReason.NOT_RELEVANT, - url_status=URLStatus.NOT_RELEVANT + url_status=URLStatus.OK ) + + # Get FlagURLValidated and confirm Not Relevant + flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0] + assert flag.type == ValidatedURLType.NOT_RELEVANT \ No newline at end of file diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index bfa126b1..fab8a1a0 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ 
b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -6,6 +6,8 @@ from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -55,7 +57,7 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): url = urls[0] assert url.id == url_mapping.url_id assert url.record_type == RecordType.ARREST_RECORDS - assert url.status == URLStatus.VALIDATED + assert url.status == URLStatus.OK assert url.name == "New Test Name" assert url.description == "New Test Description" @@ -76,3 +78,8 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): for agency in agencies: if agency.agency_id == additional_agency: assert agency.name == PLACEHOLDER_AGENCY_NAME + + # Confirm presence of FlagURLValidated + flag_url_validated = await adb_client.get_all(FlagURLValidated) + assert len(flag_url_validated) == 1 + assert flag_url_validated[0].type == ValidatedURLType.DATA_SOURCE \ No newline at end of file diff --git a/tests/automated/integration/api/review/test_batch_filtering.py b/tests/automated/integration/api/review/test_batch_filtering.py index 2e8aa63c..820dc9c0 100644 --- a/tests/automated/integration/api/review/test_batch_filtering.py +++ b/tests/automated/integration/api/review/test_batch_filtering.py @@ -1,21 +1,35 @@ import pytest +from src.collectors.enums import URLStatus +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + @pytest.mark.asyncio async def 
test_batch_filtering( - batch_url_creation_info, + batch_url_creation_info: BatchURLCreationInfo, api_test_helper ): ath = api_test_helper rv = ath.request_validator + dbdc: DBDataCreator = ath.db_data_creator + + batch_id: int = batch_url_creation_info.batch_id + + validated_url_ids: list[int] = await dbdc.create_validated_urls(count=4) + await dbdc.create_batch_url_links( + url_ids=validated_url_ids, + batch_id=batch_id + ) + # Receive null batch info if batch id not provided outer_result_no_batch_info = await rv.review_next_source() assert outer_result_no_batch_info.next_source.batch_info is None # Get batch info if batch id is provided outer_result = await ath.request_validator.review_next_source( - batch_id=batch_url_creation_info.batch_id + batch_id=batch_id ) assert outer_result.remaining == 2 batch_info = outer_result.next_source.batch_info diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py deleted file mode 100644 index 4dd21a49..00000000 --- a/tests/automated/integration/api/test_batch.py +++ /dev/null @@ -1,237 +0,0 @@ -import pytest - -from src.db.models.impl.batch.pydantic import BatchInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.collectors.impl.example.dtos.input import ExampleInputDTO -from src.collectors.enums import CollectorType, URLStatus -from src.core.enums import BatchStatus -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters - - -@pytest.mark.asyncio -async def test_get_batch_summaries(api_test_helper): - ath = api_test_helper - - batch_params = [ - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ) - ] - ), - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=4, - 
status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=3, - status=URLStatus.ERROR - ) - ] - ), - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=7, - status=URLStatus.DUPLICATE - ), - TestURLCreationParameters( - count=1, - status=URLStatus.SUBMITTED - ) - ] - ) - ] - - batch_1_creation_info = await ath.db_data_creator.batch_v2(batch_params[0]) - batch_2_creation_info = await ath.db_data_creator.batch_v2(batch_params[1]) - batch_3_creation_info = await ath.db_data_creator.batch_v2(batch_params[2]) - - batch_1_id = batch_1_creation_info.batch_id - batch_2_id = batch_2_creation_info.batch_id - batch_3_id = batch_3_creation_info.batch_id - - - response = ath.request_validator.get_batch_statuses() - results = response.results - - assert len(results) == 3 - - result_1 = results[0] - assert result_1.id == batch_1_id - assert result_1.status == BatchStatus.READY_TO_LABEL - counts_1 = result_1.url_counts - assert counts_1.total == 3 - assert counts_1.pending == 1 - assert counts_1.submitted == 2 - assert counts_1.not_relevant == 0 - assert counts_1.duplicate == 0 - assert counts_1.errored == 0 - - result_2 = results[1] - assert result_2.id == batch_2_id - counts_2 = result_2.url_counts - assert counts_2.total == 7 - assert counts_2.not_relevant == 4 - assert counts_2.errored == 3 - assert counts_2.pending == 0 - assert counts_2.submitted == 0 - assert counts_2.duplicate == 0 - - result_3 = results[2] - assert result_3.id == batch_3_id - counts_3 = result_3.url_counts - assert counts_3.total == 8 - assert counts_3.not_relevant == 0 - assert counts_3.errored == 0 - assert counts_3.pending == 0 - assert counts_3.submitted == 1 - assert counts_3.duplicate == 7 - - - - - - -@pytest.mark.asyncio -async def test_get_batch_summaries_pending_url_filter(api_test_helper): - ath = api_test_helper - - # Add an errored out batch - batch_error = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - 
batch_status=BatchStatus.ERROR - ) - - # Add a batch with pending urls - batch_pending = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.PENDING - ) - - # Add a batch with submitted URLs - batch_submitted = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.SUBMITTED - ) - - # Add an aborted batch - batch_aborted = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.ABORTED - ) - - # Add a batch with validated URLs - batch_validated = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.VALIDATED - ) - - # Test filter for pending URLs and only retrieve the second batch - pending_urls_results = ath.request_validator.get_batch_statuses( - has_pending_urls=True - ) - - assert len(pending_urls_results.results) == 1 - assert pending_urls_results.results[0].id == batch_pending.batch_id - - # Test filter without pending URLs and retrieve the other four batches - no_pending_urls_results = ath.request_validator.get_batch_statuses( - has_pending_urls=False - ) - - assert len(no_pending_urls_results.results) == 4 - for result in no_pending_urls_results.results: - assert result.id in [ - batch_error.batch_id, - batch_submitted.batch_id, - batch_validated.batch_id, - batch_aborted.batch_id - ] - - # Test no filter for pending URLs and retrieve all batches - no_filter_results = ath.request_validator.get_batch_statuses() - - assert len(no_filter_results.results) == 5 - - - - -def test_abort_batch(api_test_helper): - ath = api_test_helper - - dto = ExampleInputDTO( - sleep_time=1 - ) - - batch_id = 
ath.request_validator.example_collector(dto=dto)["batch_id"] - - response = ath.request_validator.abort_batch(batch_id=batch_id) - - assert response.message == "Batch aborted." - - bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) - - assert bi.status == BatchStatus.ABORTED - -def test_get_batch_urls(api_test_helper): - - # Insert batch and urls into database - ath = api_test_helper - batch_id = ath.db_data_creator.batch() - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) - - response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=1) - assert len(response.urls) == 100 - # Check that the first url corresponds to the first url inserted - assert response.urls[0].url == iui.url_mappings[0].url - # Check that the last url corresponds to the 100th url inserted - assert response.urls[-1].url == iui.url_mappings[99].url - - - # Check that a more limited set of urls exist - response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=2) - assert len(response.urls) == 1 - # Check that this url corresponds to the last url inserted - assert response.urls[0].url == iui.url_mappings[-1].url - -def test_get_duplicate_urls(api_test_helper): - - # Insert batch and url into database - ath = api_test_helper - batch_id = ath.db_data_creator.batch() - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) - # Get a list of all url ids - url_ids = [url.url_id for url in iui.url_mappings] - - # Create a second batch which will be associated with the duplicates - dup_batch_id = ath.db_data_creator.batch() - - # Insert duplicate urls into database - ath.db_data_creator.duplicate_urls(duplicate_batch_id=dup_batch_id, url_ids=url_ids) - - response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=1) - assert len(response.duplicates) == 100 - - response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=2) - assert 
len(response.duplicates) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 9b3fb326..1d2e595d 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,7 +2,7 @@ import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 78578c6b..f2d73f00 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -1,8 +1,8 @@ import pytest from src.core.enums import BatchStatus -from src.db.models.impl.batch.pydantic import BatchInfo -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py new file mode 100644 index 00000000..81bef537 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py @@ -0,0 +1,30 @@ +from unittest.mock import AsyncMock + +from 
src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput + + +def check_results_called( + operator: PushToHuggingFaceTaskOperator, + expected_outputs: list[GetForLoadingToHuggingFaceOutput] +) -> None: + mock_hf_client: AsyncMock = operator.hf_client + mock_push: AsyncMock = mock_hf_client.push_data_sources_raw_to_hub + outputs: list[GetForLoadingToHuggingFaceOutput] = mock_push.call_args.args[0] + outputs = sorted(outputs, key=lambda x: x.url_id) + expected_outputs = sorted(expected_outputs, key=lambda x: x.url_id) + for output, expected_output in zip(outputs, expected_outputs): + assert output.url_id == expected_output.url_id + assert output.url == expected_output.url + assert output.relevant == expected_output.relevant, f"Expected {expected_output.relevant}, got {output.relevant}" + assert output.record_type_fine == expected_output.record_type_fine + assert output.record_type_coarse == expected_output.record_type_coarse + assert output.html == expected_output.html + + +def check_not_called( + operator: PushToHuggingFaceTaskOperator, +) -> None: + mock_hf_client: AsyncMock = operator.hf_client + mock_push: AsyncMock = mock_hf_client.push_data_sources_raw_to_hub + mock_push.assert_not_called() \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py index 64a16f9f..e7a9a69b 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py @@ -1,71 +1,30 @@ -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse -from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.entry \ - import TestPushToHuggingFaceURLSetupEntry as Entry -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \ - TestPushToHuggingFaceURLSetupExpectedOutput as Output -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ - TestPushToHuggingFaceURLSetupEntryInput as Input +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput + + +def get_test_url(i: int) -> str: + return f"www.testPushToHuggingFaceURLSetupEntry.com/{i}" + +def get_test_html(i: int) -> str: + return f"
Test Push to Hugging Face URL Setup Entry {i}
" + +def generate_expected_outputs( + url_ids: list[int], + relevant: bool, + record_type_fine: RecordType, + record_type_coarse: RecordTypeCoarse +) -> list[GetForLoadingToHuggingFaceOutput]: + results: list[GetForLoadingToHuggingFaceOutput] = [] + for i in range(2): + output = GetForLoadingToHuggingFaceOutput( + url_id=url_ids[i], + url=get_test_url(i), + relevant=relevant, + record_type_fine=record_type_fine, + record_type_coarse=record_type_coarse, + html=get_test_html(i) + ) + results.append(output) + return results -ENTRIES = [ - # Because pending, should not be picked up - Entry( - input=Input( - status=URLStatus.PENDING, - has_html_content=True, - record_type=RecordType.INCARCERATION_RECORDS - ), - expected_output=Output( - picked_up=False, - ) - ), - # Because no html content, should not be picked up - Entry( - input=Input( - status=URLStatus.SUBMITTED, - has_html_content=False, - record_type=RecordType.RECORDS_REQUEST_INFO - ), - expected_output=Output( - picked_up=False, - ) - ), - # Remainder should be picked up - Entry( - input=Input( - status=URLStatus.VALIDATED, - has_html_content=True, - record_type=RecordType.RECORDS_REQUEST_INFO - ), - expected_output=Output( - picked_up=True, - coarse_record_type=RecordTypeCoarse.AGENCY_PUBLISHED_RESOURCES, - relevant=True - ) - ), - Entry( - input=Input( - status=URLStatus.SUBMITTED, - has_html_content=True, - record_type=RecordType.INCARCERATION_RECORDS - ), - expected_output=Output( - picked_up=True, - coarse_record_type=RecordTypeCoarse.JAILS_AND_COURTS, - relevant=True - ) - ), - Entry( - input=Input( - status=URLStatus.NOT_RELEVANT, - has_html_content=True, - record_type=None - ), - expected_output=Output( - picked_up=True, - coarse_record_type=RecordTypeCoarse.NOT_RELEVANT, - relevant=False - ) - ), -] diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py new file mode 100644 index 
00000000..0bb8cc87 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class PushToHuggingFaceTestSetupStatusEnum(Enum): + NOT_VALIDATED = "NOT_VALIDATED" + NOT_RELEVANT = "NOT_RELEVANT" + DATA_SOURCE = "DATA_SOURCE" diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py new file mode 100644 index 00000000..bbb40067 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py @@ -0,0 +1,16 @@ +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.queries.setup import \ + SetupTestPushToHuggingFaceEntryQueryBuilder + + +async def setup_urls( + dbc: AsyncDatabaseClient, + inp: TestPushToHuggingFaceURLSetupEntryInput +) -> list[int]: + # Set up 2 URLs + builder = SetupTestPushToHuggingFaceEntryQueryBuilder(inp) + return await dbc.run_query_builder(builder) + + diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py deleted file mode 100644 index d6438472..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py +++ /dev/null @@ -1,43 +0,0 @@ -from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from src.db.client.async_ import AsyncDatabaseClient -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import ENTRIES -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.record import \ - TestPushToHuggingFaceRecordSetupRecord as Record, TestPushToHuggingFaceRecordSetupRecord -from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.queries.setup import \ - SetupTestPushToHuggingFaceEntryQueryBuilder - - -class PushToHuggingFaceTestSetupManager: - - def __init__(self, adb_client: AsyncDatabaseClient): - self.adb_client = adb_client - self.entries = ENTRIES - # Connects a URL ID to the expectation that it will be picked up - self._id_to_record: dict[int, TestPushToHuggingFaceRecordSetupRecord] = {} - - async def setup(self) -> None: - records: list[Record] = await self.adb_client.run_query_builder( - SetupTestPushToHuggingFaceEntryQueryBuilder(self.entries) - ) - for record in records: - if not record.expected_output.picked_up: - continue - self._id_to_record[record.url_id] = record - - def check_results(self, outputs: list[GetForLoadingToHuggingFaceOutput]) -> None: - # Check that both expected and actual results are same length - length_expected = len(self._id_to_record.keys()) - length_actual = len(outputs) - assert length_expected == length_actual, f"Expected {length_expected} results, got {length_actual}" - - # Check attributes of each URL match what is expected - for output in outputs: - url_id = output.url_id - record = self._id_to_record[url_id] - - expected_output = record.expected_output - assert output.relevant == expected_output.relevant - assert output.record_type_coarse == expected_output.coarse_record_type, \ - f"Expected {expected_output.coarse_record_type} but got {output.record_type_coarse}" - assert output.record_type_fine == record.record_type_fine - diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py deleted file mode 100644 index 16bb74aa..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import BaseModel - -from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ - TestPushToHuggingFaceURLSetupEntryInput -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \ - TestPushToHuggingFaceURLSetupExpectedOutput - - -class TestPushToHuggingFaceURLSetupEntry(BaseModel): - input: TestPushToHuggingFaceURLSetupEntryInput - expected_output: TestPushToHuggingFaceURLSetupExpectedOutput - diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py index b5128375..2bdf21a5 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py @@ -1,10 +1,11 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus from src.core.enums import RecordType +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum class TestPushToHuggingFaceURLSetupEntryInput(BaseModel): - status: URLStatus + status: PushToHuggingFaceTestSetupStatusEnum record_type: RecordType | None has_html_content: bool diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py deleted file mode 100644 index 736bd97e..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Self - -from pydantic import BaseModel, model_validator - -from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse - - -class TestPushToHuggingFaceURLSetupExpectedOutput(BaseModel): - picked_up: bool - relevant: bool | None = None - coarse_record_type: RecordTypeCoarse | None = None - - @model_validator(mode='after') 
- def validate_coarse_record_type_and_relevant(self) -> Self: - if not self.picked_up: - return self - if self.coarse_record_type is None: - raise ValueError('Coarse record type should be provided if picked up') - if self.relevant is None: - raise ValueError('Relevant should be provided if picked up') - return self diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py deleted file mode 100644 index 4ce15770..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \ - TestPushToHuggingFaceURLSetupExpectedOutput - - -class TestPushToHuggingFaceRecordSetupRecord(BaseModel): - expected_output: TestPushToHuggingFaceURLSetupExpectedOutput - record_type_fine: RecordType | None - url_id: int \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py new file mode 100644 index 00000000..d0f2fea0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py @@ -0,0 +1,14 @@ +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum + +def convert_test_status_to_validated_status( + status: PushToHuggingFaceTestSetupStatusEnum +) -> ValidatedURLType: + match status: + case PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE: + return ValidatedURLType.DATA_SOURCE + case PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT: + return ValidatedURLType.NOT_RELEVANT + case _: + raise 
ValueError(f"Invalid test status for function: {status}") \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 8e01c86b..05b829df 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -1,57 +1,66 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import compress_html -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.entry import \ - TestPushToHuggingFaceURLSetupEntry as Entry -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.record import \ - TestPushToHuggingFaceRecordSetupRecord as Record +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import get_test_url, get_test_html +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.queries.convert import \ + convert_test_status_to_validated_status class SetupTestPushToHuggingFaceEntryQueryBuilder(QueryBuilderBase): def __init__( self, - entries: list[Entry] + inp: 
TestPushToHuggingFaceURLSetupEntryInput ): super().__init__() - self.entries = entries + self.inp = inp - async def run(self, session: AsyncSession) -> list[Record]: - records = [] - for idx, entry in enumerate(self.entries): - if idx % 2 == 0: + async def run(self, session: AsyncSession) -> list[int]: + url_ids: list[int] = [] + for i in range(2): + if i % 2 == 0: name = "Test Push to Hugging Face URL Setup Entry" description = "This is a test push to Hugging Face URL setup entry" else: name = None description = None - inp = entry.input url = URL( - url=f"www.testPushToHuggingFaceURLSetupEntry.com/{idx}", - status=inp.status, + url=get_test_url(i), + status=URLStatus.OK, name=name, description=description, - record_type=inp.record_type, + record_type=self.inp.record_type, source=URLSource.COLLECTOR ) session.add(url) await session.flush() - if entry.input.has_html_content: + url_ids.append(url.id) + if self.inp.status in ( + PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, + PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT + ): + flag = FlagURLValidated( + url_id=url.id, + type=convert_test_status_to_validated_status(self.inp.status), + ) + session.add(flag) + + if self.inp.has_html_content: compressed_html = URLCompressedHTML( url_id=url.id, - compressed_html=compress_html(f"
Test Push to Hugging Face URL Setup Entry {idx}
"), + compressed_html=compress_html(get_test_html(i)), ) session.add(compressed_html) - record = Record( - url_id=url.id, - expected_output=entry.expected_output, - record_type_fine=inp.record_type - ) - records.append(record) - return records + return url_ids diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py deleted file mode 100644 index d3c3e056..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py +++ /dev/null @@ -1,42 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest - -from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator -from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.manager import PushToHuggingFaceTestSetupManager -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_happy_path( - operator: PushToHuggingFaceTaskOperator, - db_data_creator: DBDataCreator -): - hf_client = operator.hf_client - push_function: AsyncMock = hf_client.push_data_sources_raw_to_hub - - # Check, prior to adding URLs, that task does not run - task_info = await operator.run_task() - assert_task_ran_without_error(task_info) - push_function.assert_not_called() - - # Add URLs - manager = PushToHuggingFaceTestSetupManager(adb_client=db_data_creator.adb_client) - await manager.setup() - - # Run task - task_info = await operator.run_task() - assert_task_ran_without_error(task_info) - push_function.assert_called_once() - - call_args: list[GetForLoadingToHuggingFaceOutput] = push_function.call_args.args[0] - - # Check for calls to HF Client - manager.check_results(call_args) - - # Test that after update, 
running again yields no results - task_info = await operator.run_task() - assert_task_ran_without_error(task_info) - push_function.assert_called_once() \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py new file mode 100644 index 00000000..25c4d09d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py @@ -0,0 +1,45 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_not_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_huggingface_task_no_html_content_not_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.ACCIDENT_REPORTS + + # Add URLs with no html content + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, + has_html_content=False + ) + _ = await setup_urls(adb_client_test, inp=inp) + + # Confirm task does not meet 
prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task as though it did meet prerequisites + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task still does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm no URLs were picked up + check_not_called(operator) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py new file mode 100644 index 00000000..b4abc0ee --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py @@ -0,0 +1,58 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_results_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import generate_expected_outputs +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + 
+@pytest.mark.asyncio +async def test_huggingface_task_not_relevant_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.COMPLAINTS_AND_MISCONDUCT + rt_coarse = RecordTypeCoarse.INFO_ABOUT_OFFICERS + + # Add URLs with not relevant status + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT, + has_html_content=True + ) + url_ids: list[int] = await setup_urls(adb_client_test, inp=inp) + + # Confirm task meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task no longer meets prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm expected URLs picked up + expected_outputs: list[GetForLoadingToHuggingFaceOutput] = generate_expected_outputs( + url_ids=url_ids, + relevant=False, + record_type_fine=record_type, + record_type_coarse=rt_coarse, + ) + check_results_called( + operator=operator, + expected_outputs=expected_outputs, + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py new file mode 100644 index 00000000..8fa07928 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py @@ -0,0 +1,44 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_not_called +from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_huggingface_task_not_validated_not_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.COURT_CASES + + # Add URLs with pending status + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.NOT_VALIDATED, + has_html_content=True + ) + _ = await setup_urls(adb_client_test, inp=inp) + + # Confirm task doesn't meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task as though it did meet prerequisites + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task still doesn't meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm pending URL not picked up + check_not_called(operator) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py new file mode 100644 index 00000000..4ca89aa1 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py @@ -0,0 +1,60 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator 
+from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_results_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import generate_expected_outputs +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_huggingface_task_validated_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.GEOGRAPHIC + rt_coarse = RecordTypeCoarse.INFO_ABOUT_AGENCIES + + # Add URLs with validated status + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, + has_html_content=True + ) + url_ids: list[int] = await setup_urls(adb_client_test, inp=inp) + + # Confirm task meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task no longer meets prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm URLs picked up + # Confirm expected URLs picked up + expected_outputs: list[GetForLoadingToHuggingFaceOutput] = generate_expected_outputs( + url_ids=url_ids, + relevant=True, + 
record_type_fine=record_type, + record_type_coarse=rt_coarse, + ) + check_results_called( + operator=operator, + expected_outputs=expected_outputs, + ) + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py index 12428d7d..dcc1fc23 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py @@ -1,4 +1,4 @@ -from datetime import timedelta +from datetime import timedelta, datetime from sqlalchemy import select, cast, func, TIMESTAMP @@ -9,14 +9,9 @@ async def check_sync_concluded( db_client: AsyncDatabaseClient, + current_db_datetime: datetime, check_updated_at: bool = True -): - - current_db_datetime = await db_client.scalar( - select( - cast(func.now(), TIMESTAMP) - ) - ) +) -> None: sync_state_results = await db_client.scalar( select( diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py index 44239db8..e91461ea 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py @@ -1,12 +1,16 @@ +from datetime import datetime + import pytest_asyncio from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.sqlalchemy import Agency from src.external.pdap.client import PDAPClient from tests.helpers.data_creator.core import DBDataCreator @pytest_asyncio.fixture -async def test_operator( +async def operator( db_data_creator: DBDataCreator, mock_pdap_client: PDAPClient ) -> SyncDataSourcesTaskOperator: @@ -14,3 +18,30 @@ async def test_operator( adb_client=db_data_creator.adb_client, 
pdap_client=mock_pdap_client ) + +@pytest_asyncio.fixture +async def current_db_time( + adb_client_test: AsyncDatabaseClient +) -> datetime: + return (await adb_client_test.get_current_database_time()).replace(tzinfo=None) + + +@pytest_asyncio.fixture +async def agency_ids( + adb_client_test: AsyncDatabaseClient +) -> list[int]: + """Creates and returns the ids of 4 agencies""" + agencies: list[Agency] = [] + agency_ids: list[int] = [] + for i in range(4): + agency = Agency( + agency_id=i, + name=f"Test Agency {i}", + state="test_state", + county="test_county", + locality="test_locality" + ) + agency_ids.append(i) + agencies.append(agency) + await adb_client_test.add_all(agencies) + return agency_ids diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py deleted file mode 100644 index 4007c38d..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py +++ /dev/null @@ -1,42 +0,0 @@ -from collections import defaultdict - -from src.db.models.impl.link.url_agency_.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo - - -class URLExistenceChecker: - - def __init__( - self, - responses: list[DataSourcesSyncResponseInfo], - url_ds_links: list[URLDataSource], - url_agency_links: list[LinkURLAgency] - ): - self._ds_id_response_dict: dict[int, DataSourcesSyncResponseInnerInfo] = {} - for response in responses: - for data_source in response.data_sources: - self._ds_id_response_dict[data_source.id] = data_source - self._ds_id_url_link_dict = {} - for link in url_ds_links: - self._ds_id_url_link_dict[link.data_source_id] = link.url_id - self._url_id_agency_link_dict = 
defaultdict(list) - for link in url_agency_links: - self._url_id_agency_link_dict[link.url_id].append(link.agency_id) - - - def check(self, url: URL): - ds_id = self._ds_id_url_link_dict.get(url.id) - if ds_id is None: - raise AssertionError(f"URL {url.id} has no data source link") - response = self._ds_id_response_dict.get(ds_id) - if response is None: - raise AssertionError(f"Data source {ds_id} has no response") - - assert response.url == url.url - assert response.description == url.description - assert response.name == url.name - - agency_ids = self._url_id_agency_link_dict.get(url.id) - assert set(response.agency_ids) == set(agency_ids) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py index 932d2518..d07ba838 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py @@ -1,7 +1,17 @@ from contextlib import contextmanager -from unittest.mock import patch +from datetime import datetime, timedelta +from unittest.mock import patch, create_autospec, AsyncMock +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.url import \ + TestDataSourcesSyncURLSetupQueryBuilder +from tests.helpers.simple_test_data_functions import generate_test_url @contextmanager @@ -11,4 +21,68 @@ def patch_sync_data_sources(side_effects: 
list): "sync_data_sources", side_effect=side_effects ): - yield \ No newline at end of file + yield + + + +def set_up_mock_pdap_client_responses( + mock_pdap_client: PDAPClient, + responses: list[DataSourcesSyncResponseInfo | Exception] +) -> None: + """ + Modifies: + - pdap_client.sync_data_sources + """ + mock_sync_data_sources = AsyncMock( + side_effect=responses + [DataSourcesSyncResponseInfo(data_sources=[])] + ) + mock_pdap_client.sync_data_sources = mock_sync_data_sources + +async def set_up_urls( + adb_client: AsyncDatabaseClient, + record_type: RecordType, + validated_type: ValidatedURLType | None = None, + previously_synced: bool = False, +) -> list[int]: + """Creates 2 test URLs.""" + + builder = TestDataSourcesSyncURLSetupQueryBuilder( + record_type=record_type, + validated_type=validated_type, + previously_synced=previously_synced, + ) + + return await adb_client.run_query_builder(builder) + +def _generate_test_data_source_name(i: int) -> str: + return f"Test Data Source {i}" + +def _generate_test_data_source_description(i: int) -> str: + return f"Test Data Source Description {i}" + +def set_up_sync_response_info( + ids: list[int], + record_type: RecordType, + agency_ids: list[int], + approval_status: ApprovalStatus, + ds_url_status: DataSourcesURLStatus, +) -> DataSourcesSyncResponseInfo: + yesterday = datetime.now() - timedelta(days=1) + inner_info_list: list[DataSourcesSyncResponseInnerInfo] = [] + for id_ in ids: + inner_info_list.append( + DataSourcesSyncResponseInnerInfo( + id=id_, + url=generate_test_url(id_), + name=_generate_test_data_source_name(id_), + description=_generate_test_data_source_description(id_), + record_type=record_type, + agency_ids=agency_ids, + approval_status=approval_status, + url_status=ds_url_status, + updated_at=yesterday, + ) + ) + return DataSourcesSyncResponseInfo( + data_sources=inner_info_list, + ) \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py deleted file mode 100644 index e4094b38..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py +++ /dev/null @@ -1,100 +0,0 @@ -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder, AgencyAssigned -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import TestSCURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.core import TestURLSetupEntry - -ENTRIES = [ - TestURLSetupEntry( - # A URL in both DBs that should be overwritten - url='https://example.com/1', - ds_info=TestDSURLSetupEntry( - id=100, - name='Overwritten URL 1 Name', - description='Overwritten URL 1 Description', - url_status=DataSourcesURLStatus.OK, - approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.ACCIDENT_REPORTS, - agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.TWO], - sync_response_order=SyncResponseOrder.FIRST - ), - sc_info=TestSCURLSetupEntry( - name='Pre-existing URL 1 Name', - description='Pre-existing URL 1 Description', - record_type=RecordType.ACCIDENT_REPORTS, - url_status=URLStatus.PENDING, - agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] - ), - final_url_status=URLStatus.SUBMITTED - ), - TestURLSetupEntry( - # A DS-only approved but broken URL - url='https://example.com/2', - ds_info=TestDSURLSetupEntry( - id=101, - name='New URL 2 Name', - description='New URL 2 Description', - 
url_status=DataSourcesURLStatus.BROKEN, - approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.INCARCERATION_RECORDS, - agencies_assigned=[AgencyAssigned.TWO], - sync_response_order=SyncResponseOrder.FIRST - ), - sc_info=None, - final_url_status=URLStatus.NOT_FOUND - ), - TestURLSetupEntry( - # An SC-only pending URL, should be unchanged. - url='https://example.com/3', - ds_info=None, - sc_info=TestSCURLSetupEntry( - name='Pre-existing URL 3 Name', - description='Pre-existing URL 3 Description', - record_type=RecordType.FIELD_CONTACTS, - url_status=URLStatus.PENDING, - agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] - ), - final_url_status=URLStatus.PENDING - ), - TestURLSetupEntry( - # A DS-only rejected URL - url='https://example.com/4', - ds_info=TestDSURLSetupEntry( - id=102, - name='New URL 4 Name', - description=None, - url_status=DataSourcesURLStatus.OK, - approval_status=ApprovalStatus.REJECTED, - record_type=RecordType.ACCIDENT_REPORTS, - agencies_assigned=[AgencyAssigned.ONE], - sync_response_order=SyncResponseOrder.FIRST - ), - sc_info=None, - final_url_status=URLStatus.NOT_RELEVANT - ), - TestURLSetupEntry( - # A pre-existing URL in the second response - url='https://example.com/5', - ds_info=TestDSURLSetupEntry( - id=103, - name='New URL 5 Name', - description=None, - url_status=DataSourcesURLStatus.OK, - approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.INCARCERATION_RECORDS, - agencies_assigned=[AgencyAssigned.ONE], - sync_response_order=SyncResponseOrder.SECOND - ), - sc_info=TestSCURLSetupEntry( - name='Pre-existing URL 5 Name', - description='Pre-existing URL 5 Description', - record_type=None, - url_status=URLStatus.PENDING, - agencies_assigned=[] - ), - final_url_status=URLStatus.SUBMITTED - ) -] - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py deleted file mode 100644 index 
fd1e1da2..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py +++ /dev/null @@ -1,16 +0,0 @@ -from enum import Enum - - -class SyncResponseOrder(Enum): - """Represents which sync response the entry is in.""" - FIRST = 1 - SECOND = 2 - # No entries should be in 3 - THIRD = 3 - - -class AgencyAssigned(Enum): - """Represents which of several pre-created agencies the entry is assigned to.""" - ONE = 1 - TWO = 2 - THREE = 3 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py deleted file mode 100644 index 0321aec9..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py +++ /dev/null @@ -1,31 +0,0 @@ -from sqlalchemy import select - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned - - -class AgencyAssignmentManager: - - def __init__(self, adb_client: AsyncDatabaseClient): - self.adb_client = adb_client - self._dict: dict[AgencyAssigned, int] = {} - - async def setup(self): - agencies = [] - for ag_enum in AgencyAssigned: - agency = Agency( - agency_id=ag_enum.value, - name=f"Test Agency {ag_enum.name}", - state="test_state", - county="test_county", - locality="test_locality" - ) - agencies.append(agency) - await self.adb_client.add_all(agencies) - agency_ids = await self.adb_client.scalars(select(Agency.agency_id)) - for ag_enum, agency_id in zip(AgencyAssigned, agency_ids): - self._dict[ag_enum] = agency_id - - async def get(self, ag_enum: AgencyAssigned) -> int: - return self._dict[ag_enum] diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py deleted file mode 100644 index 8f1ab8fa..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py +++ /dev/null @@ -1,111 +0,0 @@ -from collections import defaultdict - -from src.db.client.async_ import AsyncDatabaseClient -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo, DataSourcesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.agency import AgencyAssignmentManager -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.queries.check import \ - CheckURLQueryBuilder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.url import URLSetupFunctor -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.core import TestURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord - - -class DataSourcesSyncTestSetupManager: - - def __init__( - self, - adb_client: AsyncDatabaseClient, - entries: list[TestURLSetupEntry], - ): - self.adb_client = adb_client - self.entries = entries - self.agency_assignment_manager = AgencyAssignmentManager(self.adb_client) - - self.url_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} - self.ds_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} - self.sync_response_order_to_setup_record: dict[ - SyncResponseOrder, list[TestURLPostSetupRecord] - ] = defaultdict(list) - - self.response_dict: dict[ - SyncResponseOrder, list[DataSourcesSyncResponseInnerInfo] - ] = defaultdict(list) - - async def setup(self): - await self.setup_agencies() - await self.setup_entries() - - async def setup_entries(self): - for entry in 
self.entries: - await self.setup_entry(entry) - - async def setup_entry( - self, - entry: TestURLSetupEntry - ) -> None: - """ - Modifies: - self.url_id_to_setup_record - self.ds_id_to_setup_record - self.response_dict - """ - functor = URLSetupFunctor( - entry=entry, - agency_assignment_manager=self.agency_assignment_manager, - adb_client=self.adb_client - ) - result = await functor() - response_info = result.ds_response_info - if response_info is not None: - self.response_dict[entry.ds_info.sync_response_order].append(response_info) - if result.url_id is not None: - self.url_id_to_setup_record[result.url_id] = result - if result.data_sources_id is not None: - self.ds_id_to_setup_record[result.data_sources_id] = result - if entry.ds_info is not None: - self.sync_response_order_to_setup_record[ - entry.ds_info.sync_response_order - ].append(result) - - async def setup_agencies(self): - await self.agency_assignment_manager.setup() - - async def get_data_sources_sync_responses( - self, - orders: list[SyncResponseOrder | ValueError] - ) -> list[DataSourcesSyncResponseInfo]: - results = [] - for order in orders: - results.append( - DataSourcesSyncResponseInfo( - data_sources=self.response_dict[order] - ) - ) - return results - - async def check_via_url(self, url_id: int): - builder = CheckURLQueryBuilder( - record=self.url_id_to_setup_record[url_id] - ) - await self.adb_client.run_query_builder(builder) - - async def check_via_data_source(self, data_source_id: int): - builder = CheckURLQueryBuilder( - record=self.ds_id_to_setup_record[data_source_id] - ) - await self.adb_client.run_query_builder(builder) - - async def check_results(self): - for url_id in self.url_id_to_setup_record.keys(): - await self.check_via_url(url_id) - for data_source_id in self.ds_id_to_setup_record.keys(): - await self.check_via_data_source(data_source_id) - - async def check_via_sync_response_order(self, order: SyncResponseOrder): - records = self.sync_response_order_to_setup_record[order] - 
for record in records: - builder = CheckURLQueryBuilder( - record=record - ) - await self.adb_client.run_query_builder(builder) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py deleted file mode 100644 index ad1bc4c0..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py +++ /dev/null @@ -1,46 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload - -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.db.queries.base.builder import QueryBuilderBase -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord -from src.db.helpers.session import session_helper as sh - - -class CheckURLQueryBuilder(QueryBuilderBase): - - def __init__(self, record: TestURLPostSetupRecord): - super().__init__() - self.record = record - - async def run(self, session: AsyncSession) -> None: - """Check if url and associated properties match record. 
- Raises: - AssertionError: if url and associated properties do not match record - """ - query = ( - select(URL) - .options( - selectinload(URL.data_source), - selectinload(URL.confirmed_agencies), - ) - .outerjoin(URLDataSource, URL.id == URLDataSource.url_id) - ) - if self.record.url_id is not None: - query = query.where(URL.id == self.record.url_id) - if self.record.data_sources_id is not None: - query = query.where(URLDataSource.data_source_id == self.record.data_sources_id) - - result = await sh.one_or_none(session=session, query=query) - assert result is not None, f"URL not found for {self.record}" - await self.check_results(result) - - async def check_results(self, url: URL): - assert url.record_type == self.record.final_record_type - assert url.description == self.record.final_description - assert url.name == self.record.final_name - agencies = [agency.agency_id for agency in url.confirmed_agencies] - assert set(agencies) == set(self.record.final_agency_ids) - assert url.status == self.record.final_url_status diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py deleted file mode 100644 index 81eaa50f..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py +++ /dev/null @@ -1,97 +0,0 @@ -from pendulum import today - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.agency import AgencyAssignmentManager -from 
tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.core import TestURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import \ - TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import \ - TestSCURLSetupEntry - - -class URLSetupFunctor: - - def __init__( - self, - entry: TestURLSetupEntry, - agency_assignment_manager: AgencyAssignmentManager, - adb_client: AsyncDatabaseClient - ): - self.adb_client = adb_client - self.agency_assignment_manager = agency_assignment_manager - self.prime_entry = entry - self.sc_agency_ids = None - self.ds_agency_ids = None - self.sc_url_id = None - self.ds_response_info = None - - async def __call__(self) -> TestURLPostSetupRecord: - await self.setup_entry() - return TestURLPostSetupRecord( - url_id=self.sc_url_id, - sc_setup_entry=self.prime_entry.sc_info, - ds_setup_entry=self.prime_entry.ds_info, - sc_agency_ids=self.sc_agency_ids, - ds_agency_ids=self.ds_agency_ids, - ds_response_info=self.ds_response_info, - final_url_status=self.prime_entry.final_url_status, - ) - - async def setup_entry(self): - if self.prime_entry.sc_info is not None: - self.sc_url_id = await self.setup_sc_entry(self.prime_entry.sc_info) - if self.prime_entry.ds_info is not None: - self.ds_response_info = await self.setup_ds_entry(self.prime_entry.ds_info) - - async def get_agency_ids(self, ags: list[AgencyAssigned]): - results = [] - for ag in ags: - results.append(await self.agency_assignment_manager.get(ag)) - return results - - async def setup_sc_entry( - self, - entry: TestSCURLSetupEntry - ) -> int: - """Set up source collector entry and return url id.""" - self.sc_agency_ids = await self.get_agency_ids(self.prime_entry.sc_info.agencies_assigned) - url = URL( 
- url=self.prime_entry.url, - name=entry.name, - description=entry.description, - collector_metadata={}, - status=entry.url_status.value, - record_type=entry.record_type.value if entry.record_type is not None else None, - source=URLSource.COLLECTOR - ) - url_id = await self.adb_client.add(url, return_id=True) - links = [] - for ag_id in self.sc_agency_ids: - link = LinkURLAgency(url_id=url_id, agency_id=ag_id) - links.append(link) - await self.adb_client.add_all(links) - return url_id - - async def setup_ds_entry( - self, - ds_entry: TestDSURLSetupEntry - ) -> DataSourcesSyncResponseInnerInfo: - """Set up data source entry and return response info.""" - self.ds_agency_ids = await self.get_agency_ids(self.prime_entry.ds_info.agencies_assigned) - return DataSourcesSyncResponseInnerInfo( - id=ds_entry.id, - url=self.prime_entry.url, - name=ds_entry.name, - description=ds_entry.description, - url_status=ds_entry.url_status, - approval_status=ds_entry.approval_status, - record_type=ds_entry.record_type, - updated_at=today(), - agency_ids=self.ds_agency_ids - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py deleted file mode 100644 index 155a3ace..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py +++ /dev/null @@ -1,14 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import \ - TestSCURLSetupEntry - - -class TestURLSetupEntry(BaseModel): - url: str - ds_info: TestDSURLSetupEntry | None # Represents URL previously existing in DS DB - sc_info: TestSCURLSetupEntry | None # 
Represents URL previously existing in SC DB - - final_url_status: URLStatus diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py deleted file mode 100644 index 47809293..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py +++ /dev/null @@ -1,20 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned, SyncResponseOrder - - -class TestDSURLSetupEntry(BaseModel): - """Represents URL previously existing in DS DB. - - These values should overwrite any SC values - """ - id: int # ID of URL in DS App - name: str - description: str | None - url_status: DataSourcesURLStatus - approval_status: ApprovalStatus - record_type: RecordType - agencies_assigned: list[AgencyAssigned] - sync_response_order: SyncResponseOrder diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py deleted file mode 100644 index e535cd56..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py +++ /dev/null @@ -1,50 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import \ - TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import \ - TestSCURLSetupEntry 
- - -class TestURLPostSetupRecord(BaseModel): - """Stores a setup entry along with relevant database-generated ids""" - url_id: int | None - sc_setup_entry: TestSCURLSetupEntry | None - ds_setup_entry: TestDSURLSetupEntry | None - sc_agency_ids: list[int] | None - ds_agency_ids: list[int] | None - ds_response_info: DataSourcesSyncResponseInnerInfo | None - final_url_status: URLStatus - - @property - def data_sources_id(self) -> int | None: - if self.ds_setup_entry is None: - return None - return self.ds_setup_entry.id - - @property - def final_record_type(self) -> RecordType: - if self.ds_setup_entry is not None: - return self.ds_setup_entry.record_type - return self.sc_setup_entry.record_type - - @property - def final_name(self) -> str: - if self.ds_setup_entry is not None: - return self.ds_setup_entry.name - return self.sc_setup_entry.name - - @property - def final_description(self) -> str: - if self.ds_setup_entry is not None: - return self.ds_setup_entry.description - return self.sc_setup_entry.description - - @property - def final_agency_ids(self) -> list[int] | None: - if self.ds_setup_entry is not None: - return self.ds_agency_ids - return self.sc_agency_ids \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py deleted file mode 100644 index c151d783..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py +++ /dev/null @@ -1,17 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned - - -class TestSCURLSetupEntry(BaseModel): - """Represents URL previously existing in SC DB. 
- - These values should be overridden by any DS values - """ - name: str - description: str - record_type: RecordType | None - url_status: URLStatus - agencies_assigned: list[AgencyAssigned] diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py new file mode 100644 index 00000000..4c3c4f38 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py @@ -0,0 +1,59 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from src.db.templates.requester import RequesterBase +from tests.helpers.simple_test_data_functions import generate_test_name, generate_test_url + + +class TestDataSourcesSyncURLSetupQueryRequester(RequesterBase): + + async def insert_urls( + self, + record_type: RecordType, + ) -> list[int]: + + insert_models: list[URLInsertModel] = [] + for i in range(2): + url = URLInsertModel( + url=generate_test_url(i), + name=generate_test_name(i), + 
record_type=record_type, + source=URLSource.COLLECTOR, + ) + insert_models.append(url) + + return await self.session_helper.bulk_insert(self.session, models=insert_models, return_ids=True) + + async def insert_validated_flags( + self, + url_ids: list[int], + validated_type: ValidatedURLType + ) -> None: + to_insert: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag = FlagURLValidatedPydantic( + url_id=url_id, + type=validated_type, + ) + to_insert.append(flag) + + await self.session_helper.bulk_insert(self.session, models=to_insert) + + async def insert_data_source_entry( + self, + url_ids: list[int], + ): + to_insert: list[URLDataSourcePydantic] = [ + URLDataSourcePydantic( + url_id=url_id, + data_source_id=url_id, + ) + for url_id in url_ids + ] + + await self.session_helper.bulk_insert(self.session, models=to_insert) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py new file mode 100644 index 00000000..47b859e3 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py @@ -0,0 +1,35 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.queries.base.builder import QueryBuilderBase +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.requester import \ + TestDataSourcesSyncURLSetupQueryRequester + + +class TestDataSourcesSyncURLSetupQueryBuilder(QueryBuilderBase): + + def __init__( + self, + record_type: RecordType, + validated_type: ValidatedURLType | None = None, + previously_synced: bool = False, + ): + super().__init__() + self.record_type = record_type + self.validated_type = validated_type + self.previously_synced = previously_synced + + async def 
run(self, session: AsyncSession) -> list[int]: + requester = TestDataSourcesSyncURLSetupQueryRequester(session=session) + + url_ids: list[int] = await requester.insert_urls(record_type=self.record_type) + + if self.validated_type is not None: + await requester.insert_validated_flags(url_ids=url_ids, validated_type=self.validated_type) + + if self.previously_synced: + await requester.insert_data_source_entry(url_ids=url_ids) + + return url_ids + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py new file mode 100644 index 00000000..685132df --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py @@ -0,0 +1,76 @@ +from datetime import datetime + +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ + set_up_mock_pdap_client_responses, set_up_urls + +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_db_only( + operator: SyncDataSourcesTaskOperator, + adb_client_test: 
AsyncDatabaseClient, + current_db_time: datetime +): + """ + Test that operator does nothing with entries only in the database, and nothing is returned by the endpoint. + """ + + # Add URLs to database + url_ids: list[int] = await set_up_urls( + adb_client=adb_client_test, + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + validated_type=None, + ) + + # Set up pdap client to return nothing + set_up_mock_pdap_client_responses( + operator.pdap_client, + responses=[ + DataSourcesSyncResponseInfo(data_sources=[]) + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Check sync concluded + assert operator.pdap_client.sync_data_sources.call_count == 1 + assert operator.pdap_client.sync_data_sources.call_args[0][0] == DataSourcesSyncParameters( + cutoff_date=None, + page=1 + ) + + # Confirm URLs are unchanged in database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == len(url_ids) + assert {url.id for url in urls} == set(url_ids) + assert all(url.status == URLStatus.OK for url in urls) + assert all(url.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT for url in urls) + + # Confirm presence of sync status row with cutoff date and last updated at after initial db time + await check_sync_concluded( + adb_client_test, + check_updated_at=False, + current_db_datetime=current_db_time + ) + + # Confirm no validated flags + flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 0 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py deleted file mode 100644 index 41f38b2a..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py +++ /dev/null @@ -1,62 +0,0 @@ -from unittest.mock import MagicMock, call - 
-import pytest - -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.core import \ - DataSourcesSyncTestSetupManager -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_data_sources_sync_happy_path( - test_operator: SyncDataSourcesTaskOperator -): - adb_client = test_operator.adb_client - - manager = DataSourcesSyncTestSetupManager( - adb_client=adb_client, - entries=ENTRIES - ) - await manager.setup() - - with patch_sync_data_sources( - await manager.get_data_sources_sync_responses([order for order in SyncResponseOrder]) - ): - run_info = await test_operator.run_task() - assert_task_run_success(run_info) - mock_func: MagicMock = test_operator.pdap_client.sync_data_sources - - mock_func.assert_has_calls( - [ - call( - DataSourcesSyncParameters( - cutoff_date=None, - page=1 - ) - ), - call( - DataSourcesSyncParameters( - cutoff_date=None, - page=2 - ) - ), - call( - DataSourcesSyncParameters( - cutoff_date=None, - page=3 - ) - ) - ] - ) - await check_sync_concluded(adb_client, check_updated_at=False) - - # Check results according to expectations. 
- await manager.check_results() - - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py index 0441a102..3aa26866 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py @@ -1,50 +1,73 @@ +from datetime import datetime + import pytest from sqlalchemy import select +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.url.enums import TaskOperatorOutcome +from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.core import \ - DataSourcesSyncTestSetupManager - - +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources, \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error 
@pytest.mark.asyncio async def test_data_sources_sync_interruption( - test_operator: SyncDataSourcesTaskOperator + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + current_db_time: datetime, + agency_ids: list[int] ): - adb_client = test_operator.adb_client + """ + Test that in the case of an interruption. + The data sources sync will resume from the last processed page. + """ - manager = DataSourcesSyncTestSetupManager( - adb_client=adb_client, - entries=ENTRIES + # Set up endpoint to return URLs on page 1, raise error on page 2 + # return URLs on page 2 on the second call, and return nothing on page 3 + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + ValueError("test ds sync error"), + set_up_sync_response_info( + ids=[2, 3], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + DataSourcesSyncResponseInfo( + data_sources=[], + ) + ] ) - await manager.setup() - first_response = await manager.get_data_sources_sync_responses( - [SyncResponseOrder.FIRST] - ) - with patch_sync_data_sources( - side_effects= - first_response + - [ValueError("test error")] - ): - run_info = await test_operator.run_task() - assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() - await manager.check_via_sync_response_order(SyncResponseOrder.FIRST) + # Confirm presence of error + assert run_info.outcome == TaskOperatorOutcome.ERROR + assert "test ds sync error" in run_info.message - # Second response should not be processed - with pytest.raises(AssertionError): - await 
manager.check_via_sync_response_order(SyncResponseOrder.SECOND) + # Confirm first URLs added to database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 2 - # Check sync state results - sync_state_results = await adb_client.scalar( + # Confirm sync status updated to page 2 and cutoff date is null + sync_state_results = await adb_client_test.scalar( select( DataSourcesSyncState ) @@ -53,13 +76,22 @@ async def test_data_sources_sync_interruption( assert sync_state_results.last_full_sync_at is None assert sync_state_results.current_cutoff_date is None - second_response = await manager.get_data_sources_sync_responses( - [SyncResponseOrder.SECOND, SyncResponseOrder.THIRD] - ) - with patch_sync_data_sources(second_response): - await test_operator.run_task() + # Run operator again + run_info: TaskOperatorRunInfo = await operator.run_task() - await check_sync_concluded(adb_client) + # Confirm operator ran without error + assert_task_ran_without_error(run_info) - await manager.check_via_sync_response_order(SyncResponseOrder.SECOND) - await manager.check_via_sync_response_order(SyncResponseOrder.THIRD) \ No newline at end of file + # Confirm second URLs added to database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 4 + + # Confirm page updated to null and cutoff date updated + sync_state_results = await adb_client_test.scalar( + select( + DataSourcesSyncState + ) + ) + assert sync_state_results.current_page is None + assert sync_state_results.last_full_sync_at is not None + assert sync_state_results.current_cutoff_date is not None diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py new file mode 100644 index 00000000..0ae831bd --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py @@ -0,0 +1,107 @@ +from datetime import 
datetime, timedelta + +import pytest +from sqlalchemy import select + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_ds_sync_multiple_calls( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + current_db_time: datetime, + agency_ids: list[int] +): + """ + Test that operator properly handles multiple calls to sync endpoint. 
+ """ + + # Set up endpoint to return URLs on page 1 and 2, and stop on page 3 + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + set_up_sync_response_info( + ids=[2, 3], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + DataSourcesSyncResponseInfo( + data_sources=[], + ) + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + + # Confirm URLs are added to database + urls: list[URL] = await adb_client_test.get_all(URL) + assert all(url.status == URLStatus.OK for url in urls) + assert all(url.record_type == RecordType.ACCIDENT_REPORTS for url in urls) + url_ids: list[int] = [url.id for url in urls] + + # Confirm 3 calls to pdap_client.sync_data_sources + assert operator.pdap_client.sync_data_sources.call_count == 3 + + # Confirm sync status updated + sync_state_results = await adb_client_test.scalar( + select( + DataSourcesSyncState + ) + ) + assert sync_state_results.current_page is None + assert sync_state_results.last_full_sync_at > current_db_time - timedelta(minutes=5) + assert sync_state_results.current_cutoff_date > (current_db_time - timedelta(days=2)).date() + + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + DataSourcesSyncResponseInfo( + data_sources=[], + ) + ] + ) + + # Run operator again + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Confirm no new URLs added + urls: list[URL] = await adb_client_test.get_all(URL) + assert set([url.id for url in 
urls]) == set(url_ids) + + # Confirm call to pdap_client.sync_data_sources made with cutoff_date + operator.pdap_client.sync_data_sources.assert_called_once_with( + DataSourcesSyncParameters( + cutoff_date=sync_state_results.current_cutoff_date, + page=1 + ) + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py deleted file mode 100644 index ebcbe856..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py +++ /dev/null @@ -1,59 +0,0 @@ -from datetime import datetime -from unittest.mock import MagicMock - -import pytest - -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.core import \ - DataSourcesSyncTestSetupManager -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_data_sources_sync_no_new_results( - test_operator: SyncDataSourcesTaskOperator -): - adb_client = test_operator.adb_client - - cutoff_date = datetime(2025, 5, 1).date() - - manager = DataSourcesSyncTestSetupManager( - adb_client=adb_client, - entries=ENTRIES - ) - await manager.setup() - - first_response = await
manager.get_data_sources_sync_responses( - [SyncResponseOrder.THIRD] - ) - - # Add cutoff date to database - await adb_client.add( - DataSourcesSyncState( - current_cutoff_date=cutoff_date - ) - ) - - with patch_sync_data_sources(first_response): - run_info = await test_operator.run_task() - assert_task_run_success(run_info) - mock_func: MagicMock = test_operator.pdap_client.sync_data_sources - - mock_func.assert_called_once_with( - DataSourcesSyncParameters( - cutoff_date=cutoff_date, - page=1 - ) - ) - await check_sync_concluded(adb_client, check_updated_at=False) - - # Check no syncs occurred - for sync_response_order in [SyncResponseOrder.FIRST, SyncResponseOrder.SECOND]: - with pytest.raises(AssertionError): - await manager.check_via_sync_response_order(sync_response_order) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py new file mode 100644 index 00000000..e7a9a5a0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py @@ -0,0 +1,85 @@ +from datetime import datetime + +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from 
tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_url_broken_approved( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + agency_ids: list[int], + current_db_time: datetime +): + """ + Test that a data source with + - a broken URL status + - an approved status + Is added to the data source with a 404 Not Found status. + """ + + # Set up pdap client to return url with broken url status but approved + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.BROKEN, + ), + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Check sync concluded + assert operator.pdap_client.sync_data_sources.call_count == 2 + + # Confirm presence of URL with status of `404 not found` + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 2 + assert all([url.status == URLStatus.NOT_FOUND for url in urls]) + assert all([url.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT for url in urls]) + url_ids: list[int] = [url.id for url in urls] + + # Confirm presence of agencies + links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) + assert len(links) == 8 + assert set(link.url_id for link in links) == set(url_ids) + assert set(link.agency_id for link in links) == set(agency_ids) + + # Confirm presence of validated flag + flags:
list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 2 + assert all([flag.type == ValidatedURLType.DATA_SOURCE for flag in flags]) + assert set(flag.url_id for flag in flags) == set(url_ids) + + # Confirm presence of sync status row + await check_sync_concluded( + adb_client_test, + current_db_datetime=current_db_time + ) + + + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py new file mode 100644 index 00000000..a1e0bf2c --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py @@ -0,0 +1,94 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import set_up_urls, \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_url_in_db_overwritten_by_ds( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + agency_ids: list[int] +): + """ + Test that a URL in the database is overwritten by a data source with the same URL, + if their 
information is different. + """ + old_agency_ids: list[int] = agency_ids[:2] + new_agency_ids: list[int] = agency_ids[2:4] + + + # Add URLs to database + url_ids: list[int] = await set_up_urls( + adb_client=adb_client_test, + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + validated_type=ValidatedURLType.DATA_SOURCE, + ) + # Link URLs to 2 existing agencies + links: list[LinkURLAgency] = [] + for url_id in url_ids: + for agency_id in old_agency_ids: + link = LinkURLAgency( + url_id=url_id, + agency_id=agency_id, + ) + links.append(link) + await adb_client_test.add_all(links) + + # Set up pdap client to return same URLs with different information + # - different name + # - different description + # - different status + # - different approval status (approved vs. not relevant) + # - different record type + # - different agencies assigned + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=new_agency_ids, + approval_status=ApprovalStatus.REJECTED, + ds_url_status=DataSourcesURLStatus.BROKEN, + ), + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + + # Confirm URL name, description, record type, and status are overwritten + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 2 + assert all([url.status == URLStatus.NOT_FOUND for url in urls]) + assert all([url.record_type == RecordType.ACCIDENT_REPORTS for url in urls]) + url_ids: list[int] = [url.id for url in urls] + + # Confirm agencies are overwritten + links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) + assert len(links) == 4 + assert set(link.url_id for link in links) == set(url_ids) + assert set(link.agency_id for link in links) == set(new_agency_ids) + + # Confirm validated types overwritten + flags: 
list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 2 + assert all([flag.type == ValidatedURLType.NOT_RELEVANT for flag in flags]) + assert set(flag.url_id for flag in flags) == set(url_ids) + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py new file mode 100644 index 00000000..bc55a5be --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py @@ -0,0 +1,63 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_url_ok_approved( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + agency_ids: list[int] +): + """ + Test that a URL with an OK URL status and an approved status + is added to the database with an OK status + and a validated flag with `submitted=True` + """ + + # Set up pdap client to return url with ok url status and approved + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + 
record_type=RecordType.OTHER, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Check sync concluded + assert operator.pdap_client.sync_data_sources.call_count == 2 + + # Confirm URL is added to database with OK status + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 2 + assert all([url.status == URLStatus.OK for url in urls]) + assert all([url.record_type == RecordType.OTHER for url in urls]) + url_ids: list[int] = [url.id for url in urls] + + # Confirm presence of validated flag + flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 2 + assert all([flag.type == ValidatedURLType.DATA_SOURCE for flag in flags]) + assert set(flag.url_id for flag in flags) == set(url_ids) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py index dc261c12..13950c89 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py @@ -47,7 +47,7 @@ async def test_agency_identification_task( urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLStatus.OK, with_html_content=True ), TestURLCreationParameters( @@ -58,14 +58,14 @@ async def test_agency_identification_task( ] ) ) - collector_type_to_url_id[strategy] = creation_info.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id + collector_type_to_url_id[strategy] = creation_info.urls_by_status[URLStatus.OK].url_mappings[0].url_id # Create an additional two urls with no collector.
response = await db_data_creator.url_v2( parameters=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLStatus.OK, with_html_content=True ), TestURLCreationParameters( @@ -75,7 +75,7 @@ async def test_agency_identification_task( ) ] ) - collector_type_to_url_id[None] = response.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id + collector_type_to_url_id[None] = response.urls_by_status[URLStatus.OK].url_mappings[0].url_id # Confirm meets prerequisites diff --git a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py index 81b03070..5943213b 100644 --- a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py @@ -32,7 +32,7 @@ async def test_url_auto_relevant_task(db_data_creator): assert len(urls) == 3 counter = Counter([url.status for url in urls]) assert counter[URLStatus.ERROR] == 1 - assert counter[URLStatus.PENDING] == 2 + assert counter[URLStatus.OK] == 2 # Confirm two annotations were created suggestions: list[AutoRelevantSuggestion] = await adb_client.get_all(AutoRelevantSuggestion) diff --git a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py index 76f1969e..c0dbef6a 100644 --- a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py +++ b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py @@ -3,7 +3,6 @@ from src.external.url_request.dtos.url_response import URLResponseInfo from tests.automated.integration.tasks.url.impl.html.setup.data import TEST_ENTRIES from tests.automated.integration.tasks.url.impl.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestErrorType -from tests.helpers.simple_test_data_functions import generate_test_html def _get_success( 
@@ -29,6 +28,19 @@ def _get_content_type( return None return "text/html" +def _generate_test_html() -> str: + return """ + + + + Example HTML + + +

+        <h1>Example HTML</h1>
+        <p>This is an example of HTML content.</p>

+ + + """ def setup_url_to_response_info( ) -> dict[str, URLResponseInfo]: @@ -37,7 +49,7 @@ def setup_url_to_response_info( response_info = URLResponseInfo( success=_get_success(entry), status=get_http_status(entry), - html=generate_test_html() if _get_success(entry) else None, + html=_generate_test_html() if _get_success(entry) else None, content_type=_get_content_type(entry), exception=None if _get_success(entry) else "Error" ) diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index e9495ad4..5615392c 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -11,7 +11,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://happy-path.com/pending", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -66,7 +66,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://not-200-path.com/submitted", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -83,7 +83,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://no-web-metadata.com/submitted", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=None, expected_result=ExpectedResult( diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py index a02f1ba4..ecaec084 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py @@ -28,14 +28,14 @@ async def test_url_probe_task_no_redirect_ok( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.PENDING) + url_id = await setup_manager.setup_url(URLStatus.OK) assert await 
operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index 0c1da5fd..9d77c26f 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -31,7 +31,7 @@ async def test_two_urls( ] ) assert not await operator.meets_task_prerequisites() - url_id_1 = await setup_manager.setup_url(URLStatus.PENDING, url=url_1) + url_id_1 = await setup_manager.setup_url(URLStatus.OK, url=url_1) url_id_2 = await setup_manager.setup_url(URLStatus.NOT_RELEVANT, url=url_2) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py index 88098b16..df695021 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py @@ -28,12 +28,12 @@ async def test_url_probe_task_redirect_dest_new_ok( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.PENDING) + source_url_id = await setup_manager.setup_url(URLStatus.OK) run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=source_url_id, @@ -45,7 +45,7 
@@ async def test_url_probe_task_redirect_dest_new_ok( dest_url_id = await check_manager.check_redirect(source_url_id) await check_manager.check_url( url_id=dest_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index 0744f3b9..20671624 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -30,7 +30,7 @@ async def test_url_probe_task_redirect_dest_exists_in_db( ) ) source_url_id = await setup_manager.setup_url(URLStatus.INDIVIDUAL_RECORD) - dest_url_id = await setup_manager.setup_url(URLStatus.PENDING, url=TEST_DEST_URL) + dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, @@ -48,7 +48,7 @@ async def test_url_probe_task_redirect_dest_exists_in_db( ) await check_manager.check_url( url_id=dest_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=source_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py index ed9c38ac..5a66af3d 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py @@ -27,11 +27,11 @@ async def test_url_probe_task_redirect_infinite( redirect_url=TEST_URL ) ) - url_id = await setup_manager.setup_url(URLStatus.PENDING) + url_id = await 
setup_manager.setup_url(URLStatus.OK) run_info = await operator.run_task() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index 267d9015..f0e113ff 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -34,17 +34,17 @@ async def test_url_probe_task_redirect_two_urls_same_dest( ), ] ) - source_url_id_1 = await setup_manager.setup_url(URLStatus.PENDING) - source_url_id_2 = await setup_manager.setup_url(URLStatus.PENDING, url="https://example.com/2") + source_url_id_1 = await setup_manager.setup_url(URLStatus.OK) + source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="https://example.com/2") run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id_1, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_url( url_id=source_url_id_2, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) redirect_url_id_1 = await check_manager.check_redirect( source_url_id=source_url_id_1 diff --git a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py index 630f7f4e..25289b38 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py @@ -84,7 +84,7 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn urls=[ TestURLCreationParameters( count=3, - status=URLStatus.PENDING, + status=URLStatus.OK, with_html_content=True 
), TestURLCreationParameters( @@ -104,7 +104,7 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - pending_url_mappings = creation_info.urls_by_status[URLStatus.PENDING].url_mappings + pending_url_mappings = creation_info.urls_by_status[URLStatus.OK].url_mappings url_id_success = pending_url_mappings[0].url_id url_id_404 = pending_url_mappings[1].url_id url_id_error = pending_url_mappings[2].url_id @@ -128,9 +128,9 @@ def find_url(url_id: int) -> URL: return url raise Exception(f"URL with id {url_id} not found") - assert find_url(url_id_success).status == URLStatus.PENDING + assert find_url(url_id_success).status == URLStatus.OK assert find_url(url_id_404).status == URLStatus.NOT_FOUND - assert find_url(url_id_error).status == URLStatus.PENDING + assert find_url(url_id_error).status == URLStatus.OK assert find_url(url_id_initial_error).status == URLStatus.ERROR # Check that meets_task_prerequisites now returns False diff --git a/tests/helpers/batch_creation_parameters/core.py b/tests/helpers/batch_creation_parameters/core.py index dfc33644..4562cbdf 100644 --- a/tests/helpers/batch_creation_parameters/core.py +++ b/tests/helpers/batch_creation_parameters/core.py @@ -9,10 +9,10 @@ class TestBatchCreationParameters(BaseModel): - created_at: Optional[datetime.datetime] = None + created_at: datetime.datetime | None = None outcome: BatchStatus = BatchStatus.READY_TO_LABEL strategy: CollectorType = CollectorType.EXAMPLE - urls: Optional[list[TestURLCreationParameters]] = None + urls: list[TestURLCreationParameters] | None = None @model_validator(mode='after') def validate_urls(self): diff --git a/tests/helpers/batch_creation_parameters/enums.py b/tests/helpers/batch_creation_parameters/enums.py new file mode 100644 index 00000000..d61a2793 --- /dev/null +++ b/tests/helpers/batch_creation_parameters/enums.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class 
URLCreationEnum(Enum): + OK = "ok" + SUBMITTED = "submitted" + VALIDATED = "validated" + ERROR = "error" + NOT_RELEVANT = "not_relevant" + DUPLICATE = "duplicate" + NOT_FOUND = "not_found" \ No newline at end of file diff --git a/tests/helpers/batch_creation_parameters/url_creation_parameters.py b/tests/helpers/batch_creation_parameters/url_creation_parameters.py index 2e30cca0..701a239b 100644 --- a/tests/helpers/batch_creation_parameters/url_creation_parameters.py +++ b/tests/helpers/batch_creation_parameters/url_creation_parameters.py @@ -1,23 +1,26 @@ from pydantic import BaseModel, model_validator from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.collectors.enums import URLStatus from src.core.enums import RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class TestURLCreationParameters(BaseModel): count: int = 1 - status: URLStatus = URLStatus.PENDING + status: URLCreationEnum = URLCreationEnum.OK with_html_content: bool = False annotation_info: AnnotationInfo = AnnotationInfo() @model_validator(mode='after') def validate_annotation_info(self): - if self.status == URLStatus.NOT_RELEVANT: + if self.status == URLCreationEnum.NOT_RELEVANT: self.annotation_info.final_review_approved = False return self - if self.status != URLStatus.VALIDATED: + if self.status not in ( + URLCreationEnum.SUBMITTED, + URLCreationEnum.VALIDATED + ): return self # Assume is validated diff --git a/tests/helpers/counter.py b/tests/helpers/counter.py new file mode 100644 index 00000000..8d9de1a0 --- /dev/null +++ b/tests/helpers/counter.py @@ -0,0 +1,7 @@ + +from itertools import count + +COUNTER = count(1) + +def next_int() -> int: + return next(COUNTER) \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/batch.py b/tests/helpers/data_creator/commands/impl/batch.py index 69583a45..6871661d 100644 
--- a/tests/helpers/data_creator/commands/impl/batch.py +++ b/tests/helpers/data_creator/commands/impl/batch.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/urls_/__init__.py b/tests/helpers/data_creator/commands/impl/urls_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py new file mode 100644 index 00000000..32ec321a --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -0,0 +1,36 @@ +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum + + +def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) -> URLStatus: + match url_creation_enum: + case URLCreationEnum.OK: + return URLStatus.OK + case URLCreationEnum.SUBMITTED: + return URLStatus.OK + case URLCreationEnum.VALIDATED: + return URLStatus.OK + case URLCreationEnum.NOT_RELEVANT: + return URLStatus.OK + case URLCreationEnum.ERROR: + return URLStatus.ERROR + case URLCreationEnum.DUPLICATE: + return URLStatus.DUPLICATE + case URLCreationEnum.NOT_FOUND: + return URLStatus.NOT_FOUND + case _: + raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}") + +def convert_url_creation_enum_to_validated_type( + url_creation_enum: URLCreationEnum +) -> ValidatedURLType: + match url_creation_enum: + case URLCreationEnum.SUBMITTED: + return ValidatedURLType.DATA_SOURCE + case URLCreationEnum.VALIDATED: + return ValidatedURLType.DATA_SOURCE + case URLCreationEnum.NOT_RELEVANT: + return 
ValidatedURLType.NOT_RELEVANT + case _: + raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}") \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls_/query.py similarity index 79% rename from tests/helpers/data_creator/commands/impl/urls.py rename to tests/helpers/data_creator/commands/impl/urls_/query.py index ee9ef954..7587abfb 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls_/query.py @@ -1,11 +1,12 @@ from datetime import datetime -from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase +from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_url_status from tests.helpers.simple_test_data_functions import generate_test_urls @@ -16,7 +17,7 @@ def __init__( batch_id: int | None, url_count: int, collector_metadata: dict | None = None, - status: URLStatus = URLStatus.PENDING, + status: URLCreationEnum = URLCreationEnum.OK, created_at: datetime | None = None ): super().__init__() @@ -36,8 +37,11 @@ def run_sync(self) -> InsertURLsInfo: url_infos.append( URLInfo( url=url, - status=self.status, - name="Test Name" if self.status == URLStatus.VALIDATED else None, + status=convert_url_creation_enum_to_url_status(self.status), + name="Test Name" if self.status in ( + URLCreationEnum.VALIDATED, + URLCreationEnum.SUBMITTED, + ) else None, collector_metadata=self.collector_metadata, created_at=self.created_at, source=URLSource.COLLECTOR @@ -50,7 +54,7 @@ def run_sync(self) -> InsertURLsInfo: ) # If 
outcome is submitted, also add entry to DataSourceURL - if self.status == URLStatus.SUBMITTED: + if self.status == URLCreationEnum.SUBMITTED: submitted_url_infos = [] for url_id in url_insert_info.url_ids: submitted_url_info = SubmittedURLInfo( diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/core.py b/tests/helpers/data_creator/commands/impl/urls_v2/core.py index c80dc447..f7042720 100644 --- a/tests/helpers/data_creator/commands/impl/urls_v2/core.py +++ b/tests/helpers/data_creator/commands/impl/urls_v2/core.py @@ -1,14 +1,16 @@ from datetime import datetime -from src.collectors.enums import URLStatus from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.annotate import AnnotateCommand from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand -from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand +from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_validated_type +from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response -from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 +from tests.helpers.data_creator.generate import generate_validated_flags from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo @@ -26,7 +28,7 @@ def __init__( self.created_at = created_at async def run(self) -> URLsV2Response: - urls_by_status: dict[URLStatus, URLCreationInfo] = {} + urls_by_status: dict[URLCreationEnum, 
URLCreationInfo] = {} urls_by_order: list[URLCreationInfo] = [] # Create urls for url_parameters in self.parameters: diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/response.py b/tests/helpers/data_creator/commands/impl/urls_v2/response.py index db19328e..74aa8e20 100644 --- a/tests/helpers/data_creator/commands/impl/urls_v2/response.py +++ b/tests/helpers/data_creator/commands/impl/urls_v2/response.py @@ -1,9 +1,10 @@ from pydantic import BaseModel from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo class URLsV2Response(BaseModel): - urls_by_status: dict[URLStatus, URLCreationInfo] = {} + urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {} urls_by_order: list[URLCreationInfo] = [] \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 096bad32..389b6f66 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -7,6 +7,8 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.client.sync import DatabaseClient from src.db.enums import TaskType @@ -14,6 +16,7 @@ from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import 
TestURLCreationParameters from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand @@ -28,9 +31,11 @@ from tests.helpers.data_creator.commands.impl.suggestion.user.record_type import UserRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.relevant import UserRelevantSuggestionCommand from tests.helpers.data_creator.commands.impl.url_metadata import URLMetadataCommand -from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand +from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.core import URLsV2Command from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response +from tests.helpers.data_creator.create import create_urls, create_batch, create_batch_url_links, create_validated_flags, \ + create_url_data_sources from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 @@ -105,7 +110,7 @@ async def batch_and_urls( url_count: int = 3, with_html_content: bool = False, batch_status: BatchStatus = BatchStatus.READY_TO_LABEL, - url_status: URLStatus = URLStatus.PENDING + url_status: URLCreationEnum = URLCreationEnum.OK ) -> BatchURLCreationInfo: batch_id = self.batch( strategy=strategy, @@ -239,7 +244,7 @@ def urls( batch_id: int, url_count: int, collector_metadata: dict | None = None, - outcome: URLStatus = URLStatus.PENDING, + outcome: URLCreationEnum = URLCreationEnum.OK, created_at: datetime | None = None ) -> InsertURLsInfo: command = URLsDBDataCreatorCommand( @@ -368,3 +373,103 @@ async def url_metadata( status_code=status_code ) ) + + async def create_validated_urls( + self, + record_type: 
RecordType = RecordType.RESOURCES, + validation_type: ValidatedURLType = ValidatedURLType.DATA_SOURCE, + count: int = 1 + ) -> list[int]: + url_ids: list[int] = await self.create_urls( + record_type=record_type, + count=count + ) + await self.create_validated_flags( + url_ids=url_ids, + validation_type=validation_type + ) + return url_ids + + async def create_submitted_urls( + self, + record_type: RecordType = RecordType.RESOURCES, + count: int = 1 + ): + url_ids: list[int] = await self.create_urls( + record_type=record_type, + count=count + ) + await self.create_validated_flags( + url_ids=url_ids, + validation_type=ValidatedURLType.DATA_SOURCE + ) + await self.create_url_data_sources(url_ids=url_ids) + return url_ids + + + async def create_urls( + self, + status: URLStatus = URLStatus.OK, + source: URLSource = URLSource.COLLECTOR, + record_type: RecordType | None = RecordType.RESOURCES, + count: int = 1, + batch_id: int | None = None + ): + + url_ids: list[int] = await create_urls( + adb_client=self.adb_client, + status=status, + source=source, + record_type=record_type, + count=count + ) + if batch_id is not None: + await self.create_batch_url_links( + url_ids=url_ids, + batch_id=batch_id + ) + return url_ids + + async def create_batch( + self, + status: BatchStatus = BatchStatus.READY_TO_LABEL, + strategy: CollectorType = CollectorType.EXAMPLE, + date_generated: datetime = datetime.now(), + ): + return await create_batch( + adb_client=self.adb_client, + status=status, + strategy=strategy, + date_generated=date_generated + ) + + async def create_batch_url_links( + self, + url_ids: list[int], + batch_id: int, + ): + return await create_batch_url_links( + adb_client=self.adb_client, + url_ids=url_ids, + batch_id=batch_id + ) + + async def create_validated_flags( + self, + url_ids: list[int], + validation_type: ValidatedURLType, + ): + return await create_validated_flags( + adb_client=self.adb_client, + url_ids=url_ids, + validation_type=validation_type + ) + + 
async def create_url_data_sources( + self, + url_ids: list[int], + ): + return await create_url_data_sources( + adb_client=self.adb_client, + url_ids=url_ids + ) diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py new file mode 100644 index 00000000..af927b98 --- /dev/null +++ b/tests/helpers/data_creator/create.py @@ -0,0 +1,71 @@ +from datetime import datetime + +from src.collectors.enums import CollectorType, URLStatus +from src.core.enums import BatchStatus, RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.batch.pydantic.insert import BatchInsertModel +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ + generate_url_data_sources, generate_batch_url_links + + +async def create_batch( + adb_client: AsyncDatabaseClient, + status: BatchStatus = BatchStatus.READY_TO_LABEL, + strategy: CollectorType = CollectorType.EXAMPLE, + date_generated: datetime = datetime.now(), +) -> int: + batch: BatchInsertModel = generate_batch(status=status, strategy=strategy, date_generated=date_generated) + return (await adb_client.bulk_insert([batch], return_ids=True))[0] + +async def create_urls( + adb_client: AsyncDatabaseClient, + status: URLStatus = URLStatus.OK, + source: URLSource = URLSource.COLLECTOR, + record_type: RecordType | None = RecordType.RESOURCES, + count: int = 1 +) -> list[int]: + urls: list[URLInsertModel] = generate_urls( + status=status, + source=source, + record_type=record_type, + count=count, 
+ ) + return await adb_client.bulk_insert(urls, return_ids=True) + +async def create_validated_flags( + adb_client: AsyncDatabaseClient, + url_ids: list[int], + validation_type: ValidatedURLType, +) -> None: + validated_flags: list[FlagURLValidatedPydantic] = generate_validated_flags( + url_ids=url_ids, + validation_type=validation_type, + ) + await adb_client.bulk_insert(validated_flags) + +async def create_url_data_sources( + adb_client: AsyncDatabaseClient, + url_ids: list[int], +) -> None: + url_data_sources: list[URLDataSourcePydantic] = generate_url_data_sources( + url_ids=url_ids, + ) + await adb_client.bulk_insert(url_data_sources) + +async def create_batch_url_links( + adb_client: AsyncDatabaseClient, + url_ids: list[int], + batch_id: int, +) -> None: + batch_url_links: list[LinkBatchURLPydantic] = generate_batch_url_links( + url_ids=url_ids, + batch_id=batch_id, + ) + await adb_client.bulk_insert(batch_url_links) + diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py new file mode 100644 index 00000000..5caf4d2c --- /dev/null +++ b/tests/helpers/data_creator/generate.py @@ -0,0 +1,80 @@ +from datetime import datetime + +from src.collectors.enums import URLStatus, CollectorType +from src.core.enums import BatchStatus, RecordType +from src.db.models.impl.batch.pydantic.insert import BatchInsertModel +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from tests.helpers.counter import next_int + + +def generate_batch( + status: BatchStatus, + strategy: 
CollectorType = CollectorType.EXAMPLE, + date_generated: datetime = datetime.now(), +) -> BatchInsertModel: + return BatchInsertModel( + strategy=strategy.value, + status=status, + parameters={}, + user_id=1, + date_generated=date_generated, + ) + +def generate_batch_url_links( + url_ids: list[int], + batch_id: int +) -> list[LinkBatchURLPydantic]: + return [ + LinkBatchURLPydantic( + url_id=url_id, + batch_id=batch_id, + ) + for url_id in url_ids + ] + +def generate_urls( + status: URLStatus = URLStatus.OK, + source: URLSource = URLSource.COLLECTOR, + record_type: RecordType | None = RecordType.RESOURCES, + count: int = 1 +) -> list[URLInsertModel]: + results: list[URLInsertModel] = [] + for i in range(count): + val: int = next_int() + results.append(URLInsertModel( + url=f"http://example.com/{val}", + status=status, + source=source, + name=f"Example {val}", + record_type=record_type, + )) + return results + +def generate_validated_flags( + url_ids: list[int], + validation_type: ValidatedURLType, +) -> list[FlagURLValidatedPydantic]: + return [ + FlagURLValidatedPydantic( + url_id=url_id, + type=validation_type, + ) + for url_id in url_ids + ] + +def generate_url_data_sources( + url_ids: list[int], +) -> list[URLDataSourcePydantic]: + return [ + URLDataSourcePydantic( + url_id=url_id, + data_source_id=url_id, + ) + for url_id in url_ids + ] \ No newline at end of file diff --git a/tests/helpers/data_creator/insert.py b/tests/helpers/data_creator/insert.py new file mode 100644 index 00000000..06b207e3 --- /dev/null +++ b/tests/helpers/data_creator/insert.py @@ -0,0 +1,10 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +async def bulk_insert_all( + adb_client: AsyncDatabaseClient, + lists_of_models: list[list[BulkInsertableModel]], +): + for list_of_models in lists_of_models: + await adb_client.bulk_insert(list_of_models) \ No newline at end of file diff --git 
a/tests/helpers/data_creator/models/creation_info/batch/v2.py b/tests/helpers/data_creator/models/creation_info/batch/v2.py index 3e6ed74a..52d7e37d 100644 --- a/tests/helpers/data_creator/models/creation_info/batch/v2.py +++ b/tests/helpers/data_creator/models/creation_info/batch/v2.py @@ -1,12 +1,12 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo class BatchURLCreationInfoV2(BaseModel): batch_id: int - urls_by_status: dict[URLStatus, URLCreationInfo] = {} + urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {} @property def url_ids(self) -> list[int]: diff --git a/tests/helpers/data_creator/models/creation_info/url.py b/tests/helpers/data_creator/models/creation_info/url.py index 082769e7..16c45a0a 100644 --- a/tests/helpers/data_creator/models/creation_info/url.py +++ b/tests/helpers/data_creator/models/creation_info/url.py @@ -5,11 +5,12 @@ from src.collectors.enums import URLStatus from src.db.dtos.url.mapping import URLMapping from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class URLCreationInfo(BaseModel): url_mappings: list[URLMapping] - outcome: URLStatus + outcome: URLCreationEnum annotation_info: Optional[AnnotationInfo] = None @property diff --git a/tests/helpers/setup/annotation/core.py b/tests/helpers/setup/annotation/core.py index ff5105cd..bbc83bbc 100644 --- a/tests/helpers/setup/annotation/core.py +++ b/tests/helpers/setup/annotation/core.py @@ -6,7 +6,7 @@ async def setup_for_get_next_url_for_annotation( db_data_creator: DBDataCreator, url_count: int, - outcome: URLStatus = URLStatus.PENDING + outcome: URLStatus = URLStatus.OK ) -> AnnotationSetupInfo: batch_id = db_data_creator.batch() insert_urls_info = db_data_creator.urls( diff --git 
a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index df455e0e..7c42fd8d 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -13,16 +13,15 @@ def generate_test_urls(count: int) -> list[str]: return results -def generate_test_html() -> str: - return """ - - - - Example HTML - - -

Example HTML

-

This is an example of HTML content.

- - - """ \ No newline at end of file + +def generate_test_url(i: int) -> str: + return f"https://test.com/{i}" + +def generate_test_name(i: int) -> str: + return f"Test Name {i}" + +def generate_test_description(i: int) -> str: + return f"Test description {i}" + +def generate_test_html(i: int) -> str: + return f"

Test {i}

" \ No newline at end of file diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index 584facdd..bc9b5dfa 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -2,7 +2,7 @@ import dotenv -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 9a896392..66020a92 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,4 +1,4 @@ -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from src.collectors.impl.ckan import group_search, package_search, organization_search diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index 417e7240..216638dc 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,4 +1,4 @@ -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion From 85b134f7fc8d69a9c2b9b69c3b8c85d8686b77c2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 25 Aug 2025 07:47:54 -0400 Subject: [PATCH 04/33] Fix last 
tests --- .../get_next_url_for_user_annotation.py | 7 ++++++- .../operators/submit_approved/queries/get.py | 3 ++- .../submit_approved/queries/has_validated.py | 6 +++++- .../submit_approved/queries/mark_submitted.py | 9 --------- .../db/client/approve_url/test_basic.py | 9 ++++++++- .../test_only_confirmed_urls.py | 3 ++- .../test_validated.py | 14 ++++--------- .../happy_path/test_happy_path.py | 13 ++++++------ .../url/impl/probe/no_redirect/test_error.py | 15 +++++++++++--- .../impl/probe/no_redirect/test_not_found.py | 10 +++++++--- .../impl/probe/no_redirect/test_two_urls.py | 2 +- .../probe/redirect/test_dest_exists_in_db.py | 4 ++-- .../test_submit_approved_url_task.py | 20 +++++++++---------- .../tasks/url/impl/test_url_404_probe.py | 9 +++++---- tests/helpers/setup/annotation/core.py | 3 ++- 15 files changed, 73 insertions(+), 54 deletions(-) diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index cce1a969..6eed4b07 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,6 +5,7 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion @@ -32,6 +33,10 @@ async def run(self, session: AsyncSession): select( URL, ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) ) if self.batch_id is not None: @@ -43,7 +48,7 @@ async def run(self, session: AsyncSession): query = ( query - .where(URL.status == URLStatus.OK.value) + 
.where(FlagURLValidated.url_id.is_(None)) # URL must not have user suggestion .where( StatementComposer.user_suggestion_not_exists(self.user_suggestion_model_to_exclude) diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index 6c22c731..dc51dfbb 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -4,6 +4,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh @@ -29,7 +30,7 @@ async def _process_results(self, urls): async def _build_query(): query = ( select(URL) - .where(URL.status == URLStatus.VALIDATED.value) + .join(FlagURLValidated, FlagURLValidated.url_id == URL.id) .options( selectinload(URL.optional_data_source_metadata), selectinload(URL.confirmed_agencies), diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py index abd94d20..a554b8be 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py +++ b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py @@ -2,6 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -11,7 +12,10 @@ class HasValidatedURLsQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: query = ( select(URL) - .where(URL.status == 
URLStatus.VALIDATED.value) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) ) urls = await session.execute(query) urls = urls.scalars().all() diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py index d2563335..4ebfef56 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py +++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py @@ -19,14 +19,6 @@ async def run(self, session: AsyncSession): url_id = info.url_id data_source_id = info.data_source_id - query = ( - update(URL) - .where(URL.id == url_id) - .values( - status=URLStatus.SUBMITTED.value - ) - ) - url_data_source_object = URLDataSource( url_id=url_id, data_source_id=data_source_id @@ -35,4 +27,3 @@ async def run(self, session: AsyncSession): url_data_source_object.created_at = info.submitted_at session.add(url_data_source_object) - await session.execute(query) \ No newline at end of file diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 2a7f9569..62f215fb 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,6 +3,7 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -42,10 +43,16 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): url = urls[0] assert url.id == url_mapping.url_id assert url.record_type == 
RecordType.ARREST_RECORDS - assert url.status == URLStatus.VALIDATED + assert url.status == URLStatus.OK assert url.name == "Test Name" assert url.description == "Test Description" + # Confirm presence of validated flag + validated_flags: list[FlagURLValidated] = await adb_client.get_all(FlagURLValidated) + assert len(validated_flags) == 1 + assert validated_flags[0].url_id == url_mapping.url_id + + confirmed_agency: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agency) == 1 assert confirmed_agency[0].url_id == url_mapping.url_id diff --git a/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py b/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py index 7e68ada4..72706aaf 100644 --- a/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py +++ b/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -14,7 +15,7 @@ async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator url_mapping = db_data_creator.urls( batch_id=batch_id, url_count=1, - outcome=URLStatus.SUBMITTED + outcome=URLCreationEnum.SUBMITTED ).url_mappings[0] result = await db_data_creator.adb_client.get_next_url_for_final_review( diff --git a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py index 95e40847..7ddc11fb 100644 --- a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py +++ 
b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.setup.annotation.core import setup_for_get_next_url_for_annotation from tests.helpers.data_creator.core import DBDataCreator @@ -12,19 +13,12 @@ async def test_get_next_url_for_user_relevance_annotation_validated( """ A validated URL should not turn up in get_next_url_for_user_annotation """ - - setup_info = await setup_for_get_next_url_for_annotation( - db_data_creator=db_data_creator, - url_count=1, - outcome=URLStatus.VALIDATED - ) - - - url_1 = setup_info.insert_urls_info.url_mappings[0] + dbdc = db_data_creator + url_1: int = (await dbdc.create_validated_urls())[0] # Add `Relevancy` attribute with value `True` await db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, + url_id=url_1, relevant=True ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py index 13950c89..ff9898fe 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py @@ -13,6 +13,7 @@ from tests.automated.integration.tasks.url.impl.agency_identification.happy_path.asserts import \ assert_expected_confirmed_and_auto_suggestions from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.batch.v2 import 
BatchURLCreationInfoV2 @@ -47,35 +48,35 @@ async def test_agency_identification_task( urls=[ TestURLCreationParameters( count=1, - status=URLStatus.OK, + status=URLCreationEnum.OK, with_html_content=True ), TestURLCreationParameters( count=1, - status=URLStatus.ERROR, + status=URLCreationEnum.ERROR, with_html_content=True ) ] ) ) - collector_type_to_url_id[strategy] = creation_info.urls_by_status[URLStatus.OK].url_mappings[0].url_id + collector_type_to_url_id[strategy] = creation_info.urls_by_status[URLCreationEnum.OK].url_mappings[0].url_id # Create an additional two urls with no collector. response = await db_data_creator.url_v2( parameters=[ TestURLCreationParameters( count=1, - status=URLStatus.OK, + status=URLCreationEnum.OK, with_html_content=True ), TestURLCreationParameters( count=1, - status=URLStatus.ERROR, + status=URLCreationEnum.ERROR, with_html_content=True ) ] ) - collector_type_to_url_id[None] = response.urls_by_status[URLStatus.OK].url_mappings[0].url_id + collector_type_to_url_id[None] = response.urls_by_status[URLCreationEnum.OK].url_mappings[0].url_id # Confirm meets prerequisites diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py index 404f00e1..92add28c 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py @@ -1,15 +1,19 @@ import pytest from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager 
+from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_url_probe_task_error( setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager + check_manager: TestURLProbeCheckManager, + db_data_creator: DBDataCreator ): """ If a URL returns a 500 error response (or any other error), @@ -28,15 +32,20 @@ async def test_url_probe_task_error( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.SUBMITTED) + url_id: int = await setup_manager.setup_url(URLStatus.OK) + await db_data_creator.create_validated_flags([url_id], validation_type=ValidatedURLType.DATA_SOURCE) + await db_data_creator.create_url_data_sources([url_id]) + assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.SUBMITTED + expected_status=URLStatus.OK ) + + await check_manager.check_web_metadata( url_id=url_id, status_code=500, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py index 97937c15..575ca522 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py @@ -1,15 +1,18 @@ import pytest from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.data_creator.core import DBDataCreator 
@pytest.mark.asyncio async def test_url_probe_task_not_found( setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager + check_manager: TestURLProbeCheckManager, + db_data_creator: DBDataCreator ): """ If a URL returns a 404 error response, @@ -29,14 +32,15 @@ async def test_url_probe_task_not_found( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.NOT_RELEVANT) + url_id = await setup_manager.setup_url(URLStatus.OK) + await db_data_creator.create_validated_flags([url_id], validation_type=ValidatedURLType.NOT_RELEVANT) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.NOT_RELEVANT + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index 9d77c26f..cfd1f68f 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -32,7 +32,7 @@ async def test_two_urls( ) assert not await operator.meets_task_prerequisites() url_id_1 = await setup_manager.setup_url(URLStatus.OK, url=url_1) - url_id_2 = await setup_manager.setup_url(URLStatus.NOT_RELEVANT, url=url_2) + url_id_2 = await setup_manager.setup_url(URLStatus.OK, url=url_2) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index 20671624..b52dce6b 100644 --- 
a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -29,7 +29,7 @@ async def test_url_probe_task_redirect_dest_exists_in_db( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.INDIVIDUAL_RECORD) + source_url_id = await setup_manager.setup_url(URLStatus.OK) dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( @@ -44,7 +44,7 @@ async def test_url_probe_task_redirect_dest_exists_in_db( assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id, - expected_status=URLStatus.INDIVIDUAL_RECORD + expected_status=URLStatus.OK ) await check_manager.check_url( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py index 7d56ddcf..f992fbb6 100644 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py @@ -16,9 +16,9 @@ @pytest.mark.asyncio async def test_submit_approved_url_task( - db_data_creator, - mock_pdap_client: PDAPClient, - monkeypatch + db_data_creator, + mock_pdap_client: PDAPClient, + monkeypatch ): """ The submit_approved_url_task should submit @@ -37,7 +37,7 @@ async def test_submit_approved_url_task( # Create URLs with status 'validated' in database and all requisite URL values # Ensure they have optional metadata as well - urls = await setup_validated_urls(db_data_creator) + urls: list[str] = await setup_validated_urls(db_data_creator) mock_make_request(mock_pdap_client, urls) # Check Task Operator does meet pre-requisites @@ -50,14 +50,14 @@ async 
def test_submit_approved_url_task( assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message # Get URLs - urls = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") - url_1 = urls[0] - url_2 = urls[1] - url_3 = urls[2] + urls: list[URL] = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") + url_1: URL = urls[0] + url_2: URL = urls[1] + url_3: URL = urls[2] # Check URLs have been marked as 'submitted' - assert url_1.status == URLStatus.SUBMITTED - assert url_2.status == URLStatus.SUBMITTED + assert url_1.status == URLStatus.OK + assert url_2.status == URLStatus.OK assert url_3.status == URLStatus.ERROR # Get URL Data Source Links diff --git a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py index 25289b38..50df6aef 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py @@ -12,6 +12,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters @@ -84,12 +85,12 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn urls=[ TestURLCreationParameters( count=3, - status=URLStatus.OK, + status=URLCreationEnum.OK, with_html_content=True ), TestURLCreationParameters( count=1, - status=URLStatus.ERROR, + status=URLCreationEnum.ERROR, with_html_content=False ), ] @@ -104,12 +105,12 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn assert run_info.outcome == 
TaskOperatorOutcome.SUCCESS, run_info.message - pending_url_mappings = creation_info.urls_by_status[URLStatus.OK].url_mappings + pending_url_mappings = creation_info.urls_by_status[URLCreationEnum.OK].url_mappings url_id_success = pending_url_mappings[0].url_id url_id_404 = pending_url_mappings[1].url_id url_id_error = pending_url_mappings[2].url_id - url_id_initial_error = creation_info.urls_by_status[URLStatus.ERROR].url_mappings[0].url_id + url_id_initial_error = creation_info.urls_by_status[URLCreationEnum.ERROR].url_mappings[0].url_id # Check that URLProbedFor404 has been appropriately populated probed_for_404_objects: list[URLProbedFor404] = await db_data_creator.adb_client.get_all(URLProbedFor404) diff --git a/tests/helpers/setup/annotation/core.py b/tests/helpers/setup/annotation/core.py index bbc83bbc..70123cb9 100644 --- a/tests/helpers/setup/annotation/core.py +++ b/tests/helpers/setup/annotation/core.py @@ -1,4 +1,5 @@ from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.setup.annotation.model import AnnotationSetupInfo @@ -6,7 +7,7 @@ async def setup_for_get_next_url_for_annotation( db_data_creator: DBDataCreator, url_count: int, - outcome: URLStatus = URLStatus.OK + outcome: URLCreationEnum = URLCreationEnum.OK ) -> AnnotationSetupInfo: batch_id = db_data_creator.batch() insert_urls_info = db_data_creator.urls( From f47dbeada65992a8b6692819edd7d3a47a815cb7 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Aug 2025 12:13:37 -0400 Subject: [PATCH 05/33] Continue draft --- .../muckrock/api_interface/lookup_response.py | 4 ++-- .../scheduled/impl/sync/agency/operator.py | 13 ++++++----- .../sync/agency/queries/meta_urls/__init__.py | 0 .../sync/agency/queries/meta_urls/convert.py | 9 ++++++++ .../sync/agency/queries/meta_urls/core.py | 22 +++++++++++++++++++ .../queries/meta_urls/lookup/__init__.py | 0 
.../agency/queries/meta_urls/lookup/core.py | 15 +++++++++++++ .../queries/meta_urls/lookup/response.py | 10 +++++++++ .../sync/agency/queries/upsert/__init__.py | 0 .../queries/{upsert.py => upsert/convert.py} | 2 +- .../impl/sync/agency/queries/upsert/core.py | 19 ++++++++++++++++ src/db/client/async_.py | 9 ++++---- src/external/pdap/dtos/sync/agencies.py | 3 +++ 13 files changed, 92 insertions(+), 14 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{upsert.py => upsert/convert.py} (97%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py diff --git a/src/collectors/impl/muckrock/api_interface/lookup_response.py b/src/collectors/impl/muckrock/api_interface/lookup_response.py index 47ea855b..d1fd9635 100644 --- a/src/collectors/impl/muckrock/api_interface/lookup_response.py +++ b/src/collectors/impl/muckrock/api_interface/lookup_response.py @@ -6,6 +6,6 @@ class AgencyLookupResponse(BaseModel): - name: Optional[str] + name: str | None type: AgencyLookupResponseType - error: Optional[str] = None + error: str | None = None diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py index db20acf1..ad163d5c 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ 
b/src/core/tasks/scheduled/impl/sync/agency/operator.py @@ -21,17 +21,19 @@ def task_type(self) -> TaskType: # return TaskType.SYNC_AGENCIES async def inner_task_logic(self): - count_agencies_synced = 0 params = await self.adb_client.get_agencies_sync_parameters() if params.page is None: params.page = 1 response = await self.pdap_client.sync_agencies(params) - count_agencies_synced += len(response.agencies) - request_count = 1 + count_agencies_synced = 0 + request_count = 0 while len(response.agencies) > 0: - check_max_sync_requests_not_exceeded(request_count) await self.adb_client.upsert_agencies(response.agencies) + count_agencies_synced += len(response.agencies) + request_count += 1 + + check_max_sync_requests_not_exceeded(request_count) params = AgencySyncParameters( page=params.page + 1, @@ -40,8 +42,7 @@ async def inner_task_logic(self): await self.adb_client.update_agencies_sync_progress(params.page) response = await self.pdap_client.sync_agencies(params) - count_agencies_synced += len(response.agencies) - request_count += 1 + await self.adb_client.mark_full_agencies_sync() print(f"Sync complete. 
Synced {count_agencies_synced} agencies") diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py new file mode 100644 index 00000000..36f32111 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py @@ -0,0 +1,9 @@ +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def extract_meta_urls_from_agencies_sync_response(responses: list[AgenciesSyncResponseInnerInfo]) -> list[str]: + url_set: set[str] = set() + for response in responses: + for url in response.meta_urls: + url_set.add(url) + return list(url_set) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py new file mode 100644 index 00000000..f28d7f77 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py @@ -0,0 +1,22 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class UpdateMetaUrlsQueryBuilder(QueryBuilderBase): + """Updates meta URLs for agencies.""" + + def __init__(self, agencies: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.agencies = agencies + + async def run(self, session: AsyncSession) -> None: + + # Get existing meta URLs + + # Compare with new meta URLs, separate into add, remove, and do nothing + + # Add new meta URLs + + # Remove old meta URLs \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/__init__.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py new file mode 100644 index 00000000..eecac070 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py @@ -0,0 +1,15 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupURLsQueryBuilder(QueryBuilderBase): + """Look up URLS in database, providing mappings for those that exists.""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[AgencyLookupResponse]: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py new file mode 100644 index 00000000..f56d9841 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +class AgencyMetaURLLookupResponse(BaseModel): + url: str + url_id: int | None + agency_ids: list[int] = [] + + @property + def exists_in_db(self) -> bool: + return self.url_id is not None \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py similarity index 97% rename from 
src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py index 61a0b104..4b944464 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py @@ -17,4 +17,4 @@ def convert_agencies_sync_response_to_agencies_upsert( ds_last_updated_at=agency.updated_at ) ) - return results \ No newline at end of file + return results diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py new file mode 100644 index 00000000..0802eb56 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py @@ -0,0 +1,19 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ + convert_agencies_sync_response_to_agencies_upsert +from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + +from src.db.helpers.session import session_helper as sh + +class UpsertAgenciesQueryBuilder(QueryBuilderBase): + + def __init__(self, agencies: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.agencies = agencies + + async def run(self, session: AsyncSession) -> None: + agency_upserts: list[AgencyUpsertModel] = convert_agencies_sync_response_to_agencies_upsert(self.agencies) + await sh.bulk_upsert(session=session, models=agency_upserts) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 3af3c8db..5d7ffe0a 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -3,7 +3,7 @@ from operator import or_ from typing import Optional, Type, Any, List, Sequence -from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row +from sqlalchemy 
import select, exists, func, Select, and_, update, delete, Row from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, QueryableAttribute @@ -31,10 +31,9 @@ from src.api.endpoints.metrics.batches.aggregated.query.core import GetBatchesAggregatedMetricsQueryBuilder from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO from src.api.endpoints.metrics.batches.breakdown.query import GetBatchesBreakdownMetricsQueryBuilder -from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO +from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO -from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO, \ - GetMetricsURLsBreakdownPendingResponseInnerDTO +from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO from src.api.endpoints.metrics.dtos.get.urls.breakdown.submitted import GetMetricsURLsBreakdownSubmittedResponseDTO, \ GetMetricsURLsBreakdownSubmittedInnerDTO from src.api.endpoints.metrics.urls.aggregated.query.core import GetURLsAggregatedMetricsQueryBuilder @@ -61,7 +60,7 @@ from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \ get_update_agencies_sync_progress_query -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert import \ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert_.upsert import \ convert_agencies_sync_response_to_agencies_upsert from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters 
from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \ diff --git a/src/external/pdap/dtos/sync/agencies.py b/src/external/pdap/dtos/sync/agencies.py index 99483107..7e569a81 100644 --- a/src/external/pdap/dtos/sync/agencies.py +++ b/src/external/pdap/dtos/sync/agencies.py @@ -3,6 +3,8 @@ from pydantic import BaseModel + + class AgenciesSyncResponseInnerInfo(BaseModel): display_name: str agency_id: int @@ -10,6 +12,7 @@ class AgenciesSyncResponseInnerInfo(BaseModel): county_name: str | None locality_name: str | None updated_at: datetime.datetime + meta_urls: list[str] = [] class AgenciesSyncResponseInfo(BaseModel): agencies: list[AgenciesSyncResponseInnerInfo] From 12eee24612feb208a6d329c8e4563c2e056d6ad8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 26 Aug 2025 08:27:19 -0400 Subject: [PATCH 06/33] Continue draft --- .../sync/agency/queries/meta_urls/convert.py | 11 ++++++----- .../impl/sync/agency/queries/meta_urls/core.py | 11 ++++++++++- .../sync/agency/queries/meta_urls/filter.py | 9 +++++++++ .../agency/queries/meta_urls/lookup/core.py | 11 ++++++----- .../agency/queries/meta_urls/lookup/response.py | 13 ++++++------- .../agency/queries/meta_urls/models/__init__.py | 0 .../meta_urls/models/new_url_agencies.py | 8 ++++++++ .../agency/queries/meta_urls/models/subset.py | 10 ++++++++++ .../sync/agency/queries/meta_urls/requester.py | 17 +++++++++++++++++ src/db/templates/requester.py | 7 ++++++- 10 files changed, 78 insertions(+), 19 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py diff 
--git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py index 36f32111..87c8fdfa 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py @@ -1,9 +1,10 @@ from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -def extract_meta_urls_from_agencies_sync_response(responses: list[AgenciesSyncResponseInnerInfo]) -> list[str]: - url_set: set[str] = set() +def extract_agency_ids_from_agencies_sync_response( + responses: list[AgenciesSyncResponseInnerInfo] +) -> list[int]: + agency_ids: list[int] = [] for response in responses: - for url in response.meta_urls: - url_set.add(url) - return list(url_set) \ No newline at end of file + agency_ids.append(response.id) + return agency_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py index f28d7f77..24574c15 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py @@ -1,5 +1,9 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.convert import \ + extract_agency_ids_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.requester import UpdateMetaURLsRequester from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo @@ -13,10 +17,15 @@ def __init__(self, agencies: list[AgenciesSyncResponseInnerInfo]): async def run(self, session: AsyncSession) -> None: + requester = 
UpdateMetaURLsRequester(session) + # Get existing meta URLs + lookup_responses: list[AgencyMetaURLLookupResponse] = \ + await requester.lookup_meta_urls(self.agencies) # Compare with new meta URLs, separate into add, remove, and do nothing # Add new meta URLs - # Remove old meta URLs \ No newline at end of file + # Remove old meta URLs + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py new file mode 100644 index 00000000..67b33454 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py @@ -0,0 +1,9 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.subset import UpdateMetaAgenciesSubset +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def filter_add_and_remove_meta_urls( + lookup_responses: list[AgencyMetaURLLookupResponse], + sync_responses: list[AgenciesSyncResponseInnerInfo] +) -> UpdateMetaAgenciesSubset: \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py index eecac070..111629fa 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py @@ -1,15 +1,16 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse from src.db.queries.base.builder import QueryBuilderBase -class LookupURLsQueryBuilder(QueryBuilderBase): - """Look up URLS in database, providing mappings for those that exists.""" +class 
LookupAgencyMetaURLsQueryBuilder(QueryBuilderBase): + """Look up agencies in database, noting those that exist and providing associated meta urls.""" - def __init__(self, urls: list[str]): + def __init__(self, agency_ids: list[int]): super().__init__() - self.urls = urls + self.agency_ids = agency_ids - async def run(self, session: AsyncSession) -> list[AgencyLookupResponse]: + async def run(self, session: AsyncSession) -> list[AgencyMetaURLLookupResponse]: raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py index f56d9841..43911ef1 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py @@ -1,10 +1,9 @@ from pydantic import BaseModel -class AgencyMetaURLLookupResponse(BaseModel): - url: str - url_id: int | None - agency_ids: list[int] = [] +from src.db.dtos.url.mapping import URLMapping + - @property - def exists_in_db(self) -> bool: - return self.url_id is not None \ No newline at end of file +class AgencyMetaURLLookupResponse(BaseModel): + agency_id: int + exists_in_db: bool + url_mappings: list[URLMapping] = [] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py new file mode 100644 index 00000000..5016b0a7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + + +class NewURLAgenciesMapping(BaseModel): + """Denote URLs 
that need to be added to the database, + along with the agencies that should be associated with them.""" + url: str + agency_ids: list[int] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py new file mode 100644 index 00000000..ced11c6e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.new_url_agencies import NewURLAgenciesMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic + + +class UpdateMetaAgenciesSubset(BaseModel): + urls_to_add: list[NewURLAgenciesMapping] + links_to_add: list[LinkURLAgencyPydantic] + links_to_remove: list[LinkURLAgencyPydantic] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py new file mode 100644 index 00000000..78f8f0d5 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py @@ -0,0 +1,17 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.convert import \ + extract_agency_ids_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.core import LookupAgencyMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse +from src.db.templates.requester import RequesterBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class UpdateMetaURLsRequester(RequesterBase): + + async def lookup_meta_urls(self, agencies: list[AgenciesSyncResponseInnerInfo]) -> list[AgencyMetaURLLookupResponse]: + agency_ids: list[int] = 
extract_agency_ids_from_agencies_sync_response(agencies) + return await self.run_query_builder( + LookupAgencyMetaURLsQueryBuilder( + agency_ids + ) + ) \ No newline at end of file diff --git a/src/db/templates/requester.py b/src/db/templates/requester.py index d974245e..b56af87f 100644 --- a/src/db/templates/requester.py +++ b/src/db/templates/requester.py @@ -7,9 +7,14 @@ from sqlalchemy.ext.asyncio import AsyncSession import src.db.helpers.session.session_helper as sh +from src.db.queries.base.builder import QueryBuilderBase + class RequesterBase(ABC): def __init__(self, session: AsyncSession): self.session = session - self.session_helper = sh \ No newline at end of file + self.session_helper = sh + + async def run_query_builder(self, query_builder: QueryBuilderBase): + return await query_builder.run(session=self.session) \ No newline at end of file From 2f08da161565e96073273de5d30f2287b7929d07 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 26 Aug 2025 11:09:18 -0400 Subject: [PATCH 07/33] Continue draft --- .../sync/agency/queries/meta_urls/extract.py | 12 +++++++ .../sync/agency/queries/meta_urls/filter.py | 33 ++++++++++++++++++- .../sync/agency/queries/meta_urls/mapper.py | 21 ++++++++++++ src/db/dtos/url/mapping.py | 4 ++- 4 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/mapper.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py new file mode 100644 index 00000000..a9daf46f --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse +from src.db.dtos.url.mapping import URLMapping + + +def 
extract_url_mappings_from_agency_meta_url_lookup_response( + lookup_responses: list[AgencyMetaURLLookupResponse] +) -> list[URLMapping]: + url_mappings: set[URLMapping] = set() + for lookup_response in lookup_responses: + for url_mapping in lookup_response.url_mappings: + url_mappings.add(url_mapping) + return list(url_mappings) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py index 67b33454..4ef7fc2f 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py @@ -1,9 +1,40 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.extract import \ + extract_url_mappings_from_agency_meta_url_lookup_response from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.mapper import AgencyIDMetaURLMapper +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.new_url_agencies import NewURLAgenciesMapping from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.subset import UpdateMetaAgenciesSubset +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper def filter_add_and_remove_meta_urls( lookup_responses: list[AgencyMetaURLLookupResponse], sync_responses: list[AgenciesSyncResponseInnerInfo] -) -> UpdateMetaAgenciesSubset: \ No newline at end of file +) -> UpdateMetaAgenciesSubset: + + url_mappings: list[URLMapping] = extract_url_mappings_from_agency_meta_url_lookup_response( + lookup_responses + ) + url_mapper = URLMapper(list(url_mappings)) + + agency_meta_url_mapper = 
AgencyIDMetaURLMapper( + sync_responses + ) + + urls_to_add: list[NewURLAgenciesMapping] = [] + links_to_add: list[LinkURLAgencyPydantic] = [] + links_to_remove: list[LinkURLAgencyPydantic] = [] + + for lookup_response in lookup_responses: + if lookup_response.exists_in_db: + lookup_response.url_mappings = url_mapper.get_url_mappings( + lookup_response.agency_id + ) + else: + lookup_response.url_mappings = url_mapper.get_url_mappings( + lookup_response.agency_id + ) + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/mapper.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/mapper.py new file mode 100644 index 00000000..b46608d4 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/mapper.py @@ -0,0 +1,21 @@ +from collections import defaultdict + +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class AgencyIDMetaURLMapper: + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + self._meta_url_to_agency_id: dict[str, list[int]] = defaultdict(list) + self._agency_id_to_meta_urls: dict[int, list[str]] = defaultdict(list) + for sync_response in sync_responses: + for meta_url in sync_response.meta_urls: + self._meta_url_to_agency_id[meta_url].append(sync_response.agency_id) + self._agency_id_to_meta_urls[sync_response.agency_id].append(meta_url) + + + def get_ids(self, url: str) -> list[int]: + return self._meta_url_to_agency_id[url] + + def get_urls(self, id: int) -> list[str]: + return self._agency_id_to_meta_urls[id] \ No newline at end of file diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping.py index 18fc5be2..d48a4649 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping.py @@ -1,7 +1,9 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict class URLMapping(BaseModel): """Mapping between url and url_id.""" + model_config = ConfigDict(frozen=True) # <- makes it immutable & hashable + url: 
str url_id: int From 497be00dbce3012aed109fceb5ff811041e1c816 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 28 Aug 2025 06:39:34 -0400 Subject: [PATCH 08/33] / --- .../scheduled/impl/sync/agency/operator.py | 7 +++ .../agency/queries/meta_urls/add/__init__.py | 0 .../sync/agency/queries/meta_urls/add/core.py | 46 +++++++++++++++++++ .../sync/agency/queries/meta_urls/core.py | 6 +++ .../sync/agency/queries/meta_urls/filter.py | 28 +++++++++++ .../lookup/link_agency_url/__init__.py | 0 .../queries/meta_urls/lookup/response.py | 4 ++ .../queries/meta_urls/lookup/url/__init__.py | 0 .../queries/meta_urls/lookup/url/core.py | 15 ++++++ .../queries/meta_urls/lookup/url/response.py | 23 ++++++++++ .../queries/meta_urls/update/__init__.py | 0 .../agency/queries/meta_urls/update/core.py | 14 ++++++ 12 files changed, 143 insertions(+) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/link_agency_url/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py index ad163d5c..bf692b2d 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ b/src/core/tasks/scheduled/impl/sync/agency/operator.py @@ -4,6 +4,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.enums 
import TaskType from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo class SyncAgenciesTaskOperator(ScheduledTaskOperatorBase): @@ -47,3 +48,9 @@ async def inner_task_logic(self): await self.adb_client.mark_full_agencies_sync() print(f"Sync complete. Synced {count_agencies_synced} agencies") + async def add_new_data(self, agencies: list[AgenciesSyncResponseInnerInfo]): + # First, add new agencies + await self.adb_client.upsert_agencies(agencies) + + # Then, add new meta urls + raise NotImplementedError diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py new file mode 100644 index 00000000..76146a7e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py @@ -0,0 +1,46 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class AddMetaURLsQueryBuilder(QueryBuilderBase): + + """Add Meta URLs to DB with: + - Record type set to CONTACT_INFO_AND_AGENCY_META + - Validation Flag added as META_URL + - Source set to DATA_SOURCES + """ + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> 
list[URLMapping]: + # Add URLs + url_inserts: list[URLInsertModel] = [] + for url in self.urls: + url_inserts.append( + URLInsertModel( + url=url, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, + source=URLSource.DATA_SOURCES + ) + ) + url_ids: list[int] = await sh.bulk_insert(session, models=url_inserts, return_ids=True) + + # Add Validation Flags + flag_inserts: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag_inserts.append( + FlagURLValidatedPydantic( + url_id=url_id, + type=ValidatedURLType.META_URL + ) + ) + await sh.bulk_insert(session, models=flag_inserts) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py index 24574c15..01b3c496 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py @@ -19,6 +19,12 @@ async def run(self, session: AsyncSession) -> None: requester = UpdateMetaURLsRequester(session) + # Add new URLs to database + + # Update existing URLs as validated meta URLs + + # Update Agency-URL links + # Get existing meta URLs lookup_responses: list[AgencyMetaURLLookupResponse] = \ await requester.lookup_meta_urls(self.agencies) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py index 4ef7fc2f..c159b47c 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py @@ -29,6 +29,34 @@ def filter_add_and_remove_meta_urls( links_to_remove: list[LinkURLAgencyPydantic] = [] for lookup_response in lookup_responses: + if not lookup_response.exists_in_db: + # All meta_urls in sync must be added + urls_in_sync: list[str] = agency_meta_url_mapper.get_urls( + lookup_response.agency_id + ) + + for url in urls_in_sync: + urls_to_add.append( + 
NewURLAgenciesMapping( + agency_id=lookup_response.agency_id, + url=url + ) + ) + + # If it already exists in the database, compare the meta_urls and see if they differ + urls_in_db: list[str] = lookup_response.meta_urls + + urls_in_sync: list[str] = agency_meta_url_mapper.get_urls( + lookup_response.agency_id + ) + + in_db_not_sync: list[str] = list(set(urls_in_db) - set(urls_in_sync)) + in_sync_not_db: list[str] = list(set(urls_in_sync) - set(urls_in_db)) + + # For meta_urls in sync but not db, add to urls_to_add + + # For meta_urls in db but not sync, add to links_to_remove + if lookup_response.exists_in_db: lookup_response.url_mappings = url_mapper.get_url_mappings( lookup_response.agency_id diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/link_agency_url/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/link_agency_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py index 43911ef1..d1c1ddeb 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py @@ -7,3 +7,7 @@ class AgencyMetaURLLookupResponse(BaseModel): agency_id: int exists_in_db: bool url_mappings: list[URLMapping] = [] + + @property + def meta_urls(self) -> list[str]: + return [url_mapping.url for url_mapping in self.url_mappings] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py new file mode 100644 
index 00000000..7771a6c9 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py @@ -0,0 +1,15 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.url.response import MetaURLLookupResponse +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupMetaURLsQueryBuilder(QueryBuilderBase): + """Lookup whether URLs exist in DB and are validated as meta URLs""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py new file mode 100644 index 00000000..2c6f4b71 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py @@ -0,0 +1,23 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import ValidatedURLType + + +class MetaURLLookupResponse(BaseModel): + url: str + url_id: int | None + record_type: RecordType | None + validation_type: ValidatedURLType | None + + @property + def exists_in_db(self) -> bool: + return self.url_id is not None + + @property + def is_meta_url(self) -> bool: + return self.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META + + @property + def is_validated(self) -> bool: + return self.validation_type is not None \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py new file mode 100644 index 00000000..cbf37b20 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py @@ -0,0 +1,14 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateMetaURLsQueryBuilder(QueryBuilderBase): + """Update meta URLs in DB + + Meta URLs should be given a validation status as a Meta URL + and have their record type updated to CONTACT_INFO_AND_AGENCY_META + """ + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file From fa63ec53f6d175aebfffa198201e3b8853dc8291 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 28 Aug 2025 08:53:56 -0400 Subject: [PATCH 09/33] . --- .../metrics/batches/aggregated/query/core.py | 2 +- .../aggregated/query/rejected/query.py | 4 +-- .../batches/breakdown/not_relevant/cte_.py | 4 +-- .../aggregated/query/subqueries/rejected.py | 4 +-- .../endpoints/review/approve/query_/core.py | 4 +-- src/api/endpoints/review/reject/query.py | 8 ++--- .../impl/huggingface/queries/get/convert.py | 8 ++--- .../impl/huggingface/queries/get/core.py | 8 ++--- .../sync/agency/queries/meta_urls/add/core.py | 4 +-- .../sync/agency/queries/meta_urls/convert.py | 7 +++- .../sync/agency/queries/meta_urls/core.py | 24 +++++++++++-- .../queries/meta_urls/lookup/response.py | 1 + .../queries/meta_urls/lookup/url/response.py | 4 +-- .../agency/queries/meta_urls/requester.py | 14 +++++++- .../agency/queries/meta_urls/update/core.py | 35 +++++++++++++++++-- .../agency/queries/meta_urls/update/filter.py | 17 +++++++++ .../agency/queries/meta_urls/update/params.py | 11 ++++++ .../queries/meta_urls/update/requester.py | 13 +++++++ .../data_sources/queries/upsert/convert.py | 8 ++--- .../queries/upsert/param_manager.py | 4 +-- .../models/impl/flag/url_validated/enums.py | 2 +- .../impl/flag/url_validated/pydantic.py 
| 4 +-- .../impl/flag/url_validated/sqlalchemy.py | 4 +-- .../url_counts/builder.py | 2 +- .../url_counts/cte/not_relevant.py | 4 +-- .../api/metrics/batches/test_aggregated.py | 6 ++-- .../api/metrics/batches/test_breakdown.py | 8 ++--- .../integration/api/metrics/test_backlog.py | 8 ++--- .../api/metrics/urls/aggregated/test_core.py | 6 ++-- .../rejection/test_individual_record.py | 4 +-- .../api/review/rejection/test_not_relevant.py | 4 +-- .../test_approve_and_get_next_source.py | 4 +-- .../impl/huggingface/setup/queries/convert.py | 8 ++--- .../impl/sync/data_sources/setup/core.py | 4 +-- .../setup/queries/url_/requester.py | 4 +-- .../data_sources/setup/queries/url_/url.py | 4 +-- .../impl/sync/data_sources/test_db_only.py | 2 +- .../data_sources/test_url_broken_approved.py | 4 +-- .../test_url_in_db_overwritten_by_ds.py | 6 ++-- .../sync/data_sources/test_url_ok_approved.py | 4 +-- .../url/impl/probe/no_redirect/test_error.py | 4 +-- .../impl/probe/no_redirect/test_not_found.py | 4 +-- .../commands/impl/urls_/convert.py | 10 +++--- tests/helpers/data_creator/core.py | 8 ++--- tests/helpers/data_creator/create.py | 4 +-- tests/helpers/data_creator/generate.py | 4 +-- 46 files changed, 210 insertions(+), 100 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/params.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py index 8ffe3753..2642f002 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/core.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -17,7 +17,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import 
Batch -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py index d1505f97..6c1d9e0f 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.queries.base.builder import QueryBuilderBase @@ -30,7 +30,7 @@ async def run( FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id ) - .where(FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT) + .where(FlagURLValidated.type == URLValidatedType.NOT_RELEVANT) .group_by(Batch.strategy) ) diff --git a/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py b/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py index 20d32cf1..14403e86 100644 --- a/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py +++ b/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py @@ -2,7 +2,7 @@ from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE from src.db.models.impl.batch.sqlalchemy import Batch -from 
src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -20,7 +20,7 @@ FlagURLValidated.url_id == LinkBatchURL.url_id ) .where( - FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT ) .group_by(Batch.id) .cte("not_relevant") diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py index e4f6d823..983554ab 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py @@ -1,6 +1,6 @@ from sqlalchemy import select, func -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL @@ -13,6 +13,6 @@ URL.id == FlagURLValidated.url_id, ) .where( - FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT, + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT, ) ) \ No newline at end of file diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index 8af9af03..86c0212c 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -9,7 +9,7 @@ from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from 
src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL @@ -167,6 +167,6 @@ async def _add_validated_flag( ) -> None: flag = FlagURLValidated( url_id=url.id, - type=ValidatedURLType.DATA_SOURCE + type=URLValidatedType.DATA_SOURCE ) session.add(flag) diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index c9593a01..c187a2a8 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.reviewing_user import ReviewingUserURL @@ -35,14 +35,14 @@ async def run(self, session) -> None: url = await session.execute(query) url = url.scalars().first() - validation_type: ValidatedURLType | None = None + validation_type: URLValidatedType | None = None match self.rejection_reason: case RejectionReason.INDIVIDUAL_RECORD: - validation_type = ValidatedURLType.INDIVIDUAL_RECORD + validation_type = URLValidatedType.INDIVIDUAL_RECORD case RejectionReason.BROKEN_PAGE_404: url.status = URLStatus.NOT_FOUND.value case RejectionReason.NOT_RELEVANT: - validation_type = ValidatedURLType.NOT_RELEVANT + validation_type = URLValidatedType.NOT_RELEVANT case _: raise HTTPException( status_code=HTTP_400_BAD_REQUEST, diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py index b9056dcb..5ad96115 100644 --- 
a/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py @@ -1,7 +1,7 @@ from src.core.enums import RecordType from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse from src.core.tasks.scheduled.impl.huggingface.queries.get.mappings import FINE_COARSE_RECORD_TYPE_MAPPING -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType def convert_fine_to_coarse_record_type( @@ -11,12 +11,12 @@ def convert_fine_to_coarse_record_type( def convert_validated_type_to_relevant( - validated_type: ValidatedURLType + validated_type: URLValidatedType ) -> bool: match validated_type: - case ValidatedURLType.NOT_RELEVANT: + case URLValidatedType.NOT_RELEVANT: return False - case ValidatedURLType.DATA_SOURCE: + case URLValidatedType.DATA_SOURCE: return True case _: raise ValueError(f"Disallowed validated type: {validated_type}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index f440360c..d58cbdf7 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -6,7 +6,7 @@ from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.db.client.helpers import add_standard_limit_and_offset from src.db.helpers.session import session_helper as sh -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML @@ -47,8 +47,8 @@ async def 
run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut ) .where( FlagURLValidated.type.in_( - (ValidatedURLType.DATA_SOURCE, - ValidatedURLType.NOT_RELEVANT) + (URLValidatedType.DATA_SOURCE, + URLValidatedType.NOT_RELEVANT) ) ) ) @@ -63,7 +63,7 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut url_id=result[label_url_id], url=result[label_url], relevant=convert_validated_type_to_relevant( - ValidatedURLType(result[label_type]) + URLValidatedType(result[label_type]) ), record_type_fine=result[label_record_type_fine], record_type_coarse=convert_fine_to_coarse_record_type( diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py index 76146a7e..94ed7481 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py @@ -2,7 +2,7 @@ from src.core.enums import RecordType from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel @@ -40,7 +40,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: flag_inserts.append( FlagURLValidatedPydantic( url_id=url_id, - type=ValidatedURLType.META_URL + type=URLValidatedType.META_URL ) ) await sh.bulk_insert(session, models=flag_inserts) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py index 87c8fdfa..309b537e 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py +++ 
b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py @@ -1,3 +1,4 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.params import UpdateMetaURLsParams from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo @@ -7,4 +8,8 @@ def extract_agency_ids_from_agencies_sync_response( agency_ids: list[int] = [] for response in responses: agency_ids.append(response.id) - return agency_ids \ No newline at end of file + return agency_ids + + +def convert_to_update_meta_urls_params(agencies: list[AgenciesSyncResponseInnerInfo]) -> list[UpdateMetaURLsParams]: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py index 01b3c496..02943e94 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py @@ -3,7 +3,9 @@ from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.convert import \ extract_agency_ids_from_agencies_sync_response from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.url.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.requester import UpdateMetaURLsRequester +from src.db.dtos.url.mapping import URLMapping from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo @@ -13,13 +15,31 @@ class UpdateMetaUrlsQueryBuilder(QueryBuilderBase): def __init__(self, agencies: list[AgenciesSyncResponseInnerInfo]): super().__init__() - self.agencies = agencies + self.responses = agencies async def run(self, session: AsyncSession) -> None: requester = UpdateMetaURLsRequester(session) + # Get URLs to 
Add + lookup_responses: list[MetaURLLookupResponse] = await requester.lookup_meta_urls(self.responses) + + urls_to_add: list[str] = filter_urls_to_add(lookup_responses) + # Add new URLs to database + new_url_mappings: list[URLMapping] = await requester.add_meta_urls(urls_to_add) + existing_url_mappings: list[URLMapping] = filter_existing_url_mappings(lookup_responses) + + all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings + + + + + + # Update existing URLs + + + # Update existing URLs as validated meta URLs @@ -27,7 +47,7 @@ async def run(self, session: AsyncSession) -> None: # Get existing meta URLs lookup_responses: list[AgencyMetaURLLookupResponse] = \ - await requester.lookup_meta_urls(self.agencies) + await requester.lookup_meta_urls(self.responses) # Compare with new meta URLs, separate into add, remove, and do nothing diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py index d1c1ddeb..51eb9b2c 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py @@ -1,6 +1,7 @@ from pydantic import BaseModel from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType class AgencyMetaURLLookupResponse(BaseModel): diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py index 2c6f4b71..4e14cb53 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/response.py @@ -1,14 +1,14 @@ from pydantic import BaseModel from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import 
ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType class MetaURLLookupResponse(BaseModel): url: str url_id: int | None record_type: RecordType | None - validation_type: ValidatedURLType | None + validation_type: URLValidatedType | None @property def exists_in_db(self) -> bool: diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py index 78f8f0d5..46698832 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py @@ -14,4 +14,16 @@ async def lookup_meta_urls(self, agencies: list[AgenciesSyncResponseInnerInfo]) LookupAgencyMetaURLsQueryBuilder( agency_ids ) - ) \ No newline at end of file + ) + + async def add_meta_urls(self) -> None: + raise NotImplementedError + + async def update_meta_urls(self) -> None: + raise NotImplementedError + + async def add_agency_url_links(self) -> None: + raise NotImplementedError + + async def remove_agency_url_links(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py index cbf37b20..952f87f3 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/core.py @@ -1,5 +1,11 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.filter import \ + filter_urls_with_non_meta_record_type, filter_urls_with_non_meta_url_validation_flag, \ + filter_urls_without_validation_flag +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.params import UpdateMetaURLsParams +from 
src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.requester import UpdateMetaURLsRequester, \ + UpdateMetaURLsUpdateURLAndValidationFlagsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -10,5 +16,30 @@ class UpdateMetaURLsQueryBuilder(QueryBuilderBase): and have their record type updated to CONTACT_INFO_AND_AGENCY_META """ - async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + def __init__( + self, + params: list[UpdateMetaURLsParams] + ): + super().__init__() + self.params = params + + async def run( + self, + session: AsyncSession + ) -> None: + requester = UpdateMetaURLsUpdateURLAndValidationFlagsRequester(session) + + urls_with_non_meta_record_type: list[int] = filter_urls_with_non_meta_record_type(self.params) + await requester.update_urls(urls_with_non_meta_record_type) + + urls_without_validation_flag: list[int] = filter_urls_without_validation_flag(self.params) + await requester.add_validation_flags(urls_without_validation_flag) + + urls_with_non_meta_url_validation_flag: list[int] = filter_urls_with_non_meta_url_validation_flag(self.params) + await requester.update_validation_flags(urls_with_non_meta_url_validation_flag) + + + + + + raise NotImplementedError diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py new file mode 100644 index 00000000..41a0f5ee --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py @@ -0,0 +1,17 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.params import UpdateMetaURLsParams + + +def filter_urls_with_non_meta_record_type( + params: list[UpdateMetaURLsParams] +) -> list[int]: + raise NotImplementedError + +def filter_urls_without_validation_flag( + params: list[UpdateMetaURLsParams] +) -> list[int]: + raise NotImplementedError + +def 
filter_urls_with_non_meta_url_validation_flag( + params: list[UpdateMetaURLsParams] +) -> list[int]: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/params.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/params.py new file mode 100644 index 00000000..cb74a378 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/params.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType + + +class UpdateMetaURLsParams(BaseModel): + validation_type: URLValidatedType | None + url_id: int + record_type: RecordType | None + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py new file mode 100644 index 00000000..80233975 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py @@ -0,0 +1,13 @@ +from src.db.templates.requester import RequesterBase + + +class UpdateMetaURLsUpdateURLAndValidationFlagsRequester(RequesterBase): + + async def update_validation_flags(self, url_ids: list[int]) -> None: + raise NotImplementedError + + async def add_validation_flags(self, url_ids: list[int]) -> None: + raise NotImplementedError + + async def update_urls(self, url_ids: list[int]) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py index 7e131b89..e2def8c2 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py @@ -1,6 +1,6 @@ from 
src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import URLDataSyncInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.external.pdap.enums import ApprovalStatus @@ -14,11 +14,11 @@ def convert_url_sync_info_to_url_mappings( def convert_approval_status_to_validated_type( approval_status: ApprovalStatus -) -> ValidatedURLType: +) -> URLValidatedType: match approval_status: case ApprovalStatus.APPROVED: - return ValidatedURLType.DATA_SOURCE + return URLValidatedType.DATA_SOURCE case ApprovalStatus.REJECTED: - return ValidatedURLType.NOT_RELEVANT + return URLValidatedType.NOT_RELEVANT case _: raise ValueError(f"Invalid approval status: {approval_status}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py index 6493d3c8..5c57474d 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py @@ -12,7 +12,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic @@ -116,7 +116,7 @@ def upsert_validated_flags( url_id: int = mapper.get_id(url) sync_info: DataSourcesSyncResponseInnerInfo = self._mapper.get(url) 
approval_status: ApprovalStatus = sync_info.approval_status - validated_type: ValidatedURLType = convert_approval_status_to_validated_type(approval_status) + validated_type: URLValidatedType = convert_approval_status_to_validated_type(approval_status) flag = FlagURLValidatedPydantic( url_id=url_id, type=validated_type diff --git a/src/db/models/impl/flag/url_validated/enums.py b/src/db/models/impl/flag/url_validated/enums.py index a0228ee1..fe74b84c 100644 --- a/src/db/models/impl/flag/url_validated/enums.py +++ b/src/db/models/impl/flag/url_validated/enums.py @@ -1,7 +1,7 @@ from enum import Enum -class ValidatedURLType(Enum): +class URLValidatedType(Enum): DATA_SOURCE = "data source" META_URL = "meta url" NOT_RELEVANT = "not relevant" diff --git a/src/db/models/impl/flag/url_validated/pydantic.py b/src/db/models/impl/flag/url_validated/pydantic.py index ccf3a110..197c05a0 100644 --- a/src/db/models/impl/flag/url_validated/pydantic.py +++ b/src/db/models/impl/flag/url_validated/pydantic.py @@ -1,4 +1,4 @@ -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel @@ -11,7 +11,7 @@ class FlagURLValidatedPydantic( ): url_id: int - type: ValidatedURLType + type: URLValidatedType @classmethod def sa_model(cls) -> type_[FlagURLValidated]: diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py index 9d0528ab..f6d4e770 100644 --- a/src/db/models/impl/flag/url_validated/sqlalchemy.py +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -1,7 +1,7 @@ from sqlalchemy import PrimaryKeyConstraint from src.db.models.helpers import enum_column -from src.db.models.impl.flag.url_validated.enums import 
ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin from src.db.models.templates_.base import Base @@ -20,6 +20,6 @@ class FlagURLValidated( ) type = enum_column( - enum_type=ValidatedURLType, + enum_type=URLValidatedType, name="validated_url_type", ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index afbd4477..634cf419 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -3,7 +3,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py index cbb55369..e84f597b 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py @@ -1,7 +1,7 @@ from sqlalchemy import select, func from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import 
FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL @@ -26,7 +26,7 @@ FlagURLValidated.url_id == URL.id, ) .where( - FlagURLValidated.type == ValidatedURLType.NOT_RELEVANT + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT ) .group_by( Batch.id diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 3121dd4e..306160fa 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -4,7 +4,7 @@ from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.helpers.connect import get_postgres_connection_string -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ create_batch_url_links, create_validated_flags @@ -46,12 +46,12 @@ async def test_get_batches_aggregated_metrics( await create_validated_flags( adb_client=adb_client, url_ids=urls_validated + urls_submitted, - validation_type=ValidatedURLType.DATA_SOURCE, + validation_type=URLValidatedType.DATA_SOURCE, ) await create_validated_flags( adb_client=adb_client, url_ids=urls_not_relevant, - validation_type=ValidatedURLType.NOT_RELEVANT, + validation_type=URLValidatedType.NOT_RELEVANT, ) await create_url_data_sources( adb_client=adb_client, diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index a75979ea..455d9399 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ 
b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -6,7 +6,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ create_url_data_sources @@ -30,7 +30,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): await create_validated_flags( adb_client=adb_client, url_ids=url_ids_1[:2], - validation_type=ValidatedURLType.DATA_SOURCE + validation_type=URLValidatedType.DATA_SOURCE ) await create_url_data_sources( adb_client=adb_client, @@ -60,12 +60,12 @@ async def test_get_batches_breakdown_metrics(api_test_helper): await create_validated_flags( adb_client=adb_client, url_ids=validated_url_ids[:3], - validation_type=ValidatedURLType.NOT_RELEVANT, + validation_type=URLValidatedType.NOT_RELEVANT, ) await create_validated_flags( adb_client=adb_client, url_ids=validated_url_ids[4:9], - validation_type=ValidatedURLType.DATA_SOURCE, + validation_type=URLValidatedType.DATA_SOURCE, ) await create_batch_url_links( adb_client=adb_client, diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index d39d0640..9fe7a45c 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import SuggestedStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from 
tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -29,7 +29,7 @@ async def test_get_backlog_metrics(api_test_helper): submitted_url_ids_1: list[int] = url_ids_1[:2] await ddc.create_validated_flags( url_ids=submitted_url_ids_1, - validation_type=ValidatedURLType.DATA_SOURCE + validation_type=URLValidatedType.DATA_SOURCE ) await ddc.create_url_data_sources(url_ids=submitted_url_ids_1) @@ -46,7 +46,7 @@ async def test_get_backlog_metrics(api_test_helper): await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) await ddc.create_validated_flags( url_ids=not_relevant_url_ids_2[:4], - validation_type=ValidatedURLType.NOT_RELEVANT + validation_type=URLValidatedType.NOT_RELEVANT ) error_url_ids_2: list[int] = await ddc.create_urls( status=URLStatus.ERROR, @@ -67,7 +67,7 @@ async def test_get_backlog_metrics(api_test_helper): await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) await ddc.create_validated_flags( url_ids=url_ids_3[:5], - validation_type=ValidatedURLType.DATA_SOURCE + validation_type=URLValidatedType.DATA_SOURCE ) diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index 49f63cf4..f22ec757 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -4,7 +4,7 @@ import pytest from src.collectors.enums import CollectorType, URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import 
TestURLCreationParameters @@ -47,8 +47,8 @@ async def test_get_urls_aggregated_metrics(api_test_helper): ) url_ids_2_ok: list[int] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) url_ids_2_error: list[int] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) - url_ids_2_validated: list[int] = await ddc.create_validated_urls(count=1, validation_type=ValidatedURLType.DATA_SOURCE) - url_ids_2_not_relevant: list[int] = await ddc.create_validated_urls(count=5, validation_type=ValidatedURLType.NOT_RELEVANT) + url_ids_2_validated: list[int] = await ddc.create_validated_urls(count=1, validation_type=URLValidatedType.DATA_SOURCE) + url_ids_2_not_relevant: list[int] = await ddc.create_validated_urls(count=5, validation_type=URLValidatedType.NOT_RELEVANT) await ddc.create_batch_url_links( url_ids=url_ids_2_validated + url_ids_2_not_relevant, batch_id=batch_2 diff --git a/tests/automated/integration/api/review/rejection/test_individual_record.py b/tests/automated/integration/api/review/rejection/test_individual_record.py index ec96819a..33addd91 100644 --- a/tests/automated/integration/api/review/rejection/test_individual_record.py +++ b/tests/automated/integration/api/review/rejection/test_individual_record.py @@ -2,7 +2,7 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.api.review.rejection.helpers import run_rejection_test from tests.helpers.api_test_helper import APITestHelper @@ -18,5 +18,5 @@ async def test_rejection_individual_record(api_test_helper: APITestHelper): # Get FlagURLValidated and confirm Individual Record flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0] - assert 
flag.type == ValidatedURLType.INDIVIDUAL_RECORD + assert flag.type == URLValidatedType.INDIVIDUAL_RECORD diff --git a/tests/automated/integration/api/review/rejection/test_not_relevant.py b/tests/automated/integration/api/review/rejection/test_not_relevant.py index 7b6154e1..03ee72d3 100644 --- a/tests/automated/integration/api/review/rejection/test_not_relevant.py +++ b/tests/automated/integration/api/review/rejection/test_not_relevant.py @@ -2,7 +2,7 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.api.review.rejection.helpers import run_rejection_test @@ -17,4 +17,4 @@ async def test_rejection_not_relevant(api_test_helper): # Get FlagURLValidated and confirm Not Relevant flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0] - assert flag.type == ValidatedURLType.NOT_RELEVANT \ No newline at end of file + assert flag.type == URLValidatedType.NOT_RELEVANT \ No newline at end of file diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index fab8a1a0..69cf13d2 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -6,7 +6,7 @@ from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated 
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL @@ -82,4 +82,4 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): # Confirm presence of FlagURLValidated flag_url_validated = await adb_client.get_all(FlagURLValidated) assert len(flag_url_validated) == 1 - assert flag_url_validated[0].type == ValidatedURLType.DATA_SOURCE \ No newline at end of file + assert flag_url_validated[0].type == URLValidatedType.DATA_SOURCE \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py index d0f2fea0..2fb5b2d0 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py @@ -1,14 +1,14 @@ -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ PushToHuggingFaceTestSetupStatusEnum def convert_test_status_to_validated_status( status: PushToHuggingFaceTestSetupStatusEnum -) -> ValidatedURLType: +) -> URLValidatedType: match status: case PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE: - return ValidatedURLType.DATA_SOURCE + return URLValidatedType.DATA_SOURCE case PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT: - return ValidatedURLType.NOT_RELEVANT + return URLValidatedType.NOT_RELEVANT case _: raise ValueError(f"Invalid test status for function: {status}") \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py index d07ba838..f7cd3337 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py @@ -5,7 +5,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.external.pdap.client import PDAPClient from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus @@ -41,7 +41,7 @@ def set_up_mock_pdap_client_responses( async def set_up_urls( adb_client: AsyncDatabaseClient, record_type: RecordType, - validated_type: ValidatedURLType | None = None, + validated_type: URLValidatedType | None = None, previously_synced: bool = False, ) -> list[int]: """Creates 2 test URLs.""" diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py index 4c3c4f38..a514b151 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py @@ -1,7 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel @@ -32,7 +32,7 @@ async def insert_urls( async def 
insert_validated_flags( self, url_ids: list[int], - validated_type: ValidatedURLType + validated_type: URLValidatedType ) -> None: to_insert: list[FlagURLValidatedPydantic] = [] for url_id in url_ids: diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py index 47b859e3..0176a95f 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py @@ -1,7 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.queries.base.builder import QueryBuilderBase from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.requester import \ TestDataSourcesSyncURLSetupQueryRequester @@ -12,7 +12,7 @@ class TestDataSourcesSyncURLSetupQueryBuilder(QueryBuilderBase): def __init__( self, record_type: RecordType, - validated_type: ValidatedURLType | None = None, + validated_type: URLValidatedType | None = None, previously_synced: bool = False, ): super().__init__() diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py index 685132df..87cf163a 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py @@ -8,7 +8,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.db.client.async_ 
import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py index e7a9a5a0..7878c83f 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py @@ -8,7 +8,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL @@ -72,7 +72,7 @@ async def test_url_broken_approved( # Confirm presence of validated flag flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) assert len(flags) == 2 - assert all([flag.type == ValidatedURLType.DATA_SOURCE for flag in flags]) + assert all([flag.type == URLValidatedType.DATA_SOURCE for flag in flags]) assert set(flag.url_id for flag in flags) == set(url_ids) # Confirm presence of sync status row diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py index a1e0bf2c..e1c7f33c 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py @@ -5,7 +5,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL @@ -33,7 +33,7 @@ async def test_url_in_db_overwritten_by_ds( url_ids: list[int] = await set_up_urls( adb_client=adb_client_test, record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, - validated_type=ValidatedURLType.DATA_SOURCE, + validated_type=URLValidatedType.DATA_SOURCE, ) # Link URLs to 2 existing agencies links: list[LinkURLAgency] = [] @@ -89,6 +89,6 @@ async def test_url_in_db_overwritten_by_ds( # Confirm validated types overwritten flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) assert len(flags) == 2 - assert all([flag.type == ValidatedURLType.NOT_RELEVANT for flag in flags]) + assert all([flag.type == URLValidatedType.NOT_RELEVANT for flag in flags]) assert set(flag.url_id for flag in flags) == set(url_ids) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py index bc55a5be..eeff4028 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py @@ -5,7 +5,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus @@ -59,5 +59,5 @@ async def test_url_ok_approved( # Confirm presence of validated flag flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) assert len(flags) == 2 - assert all([flag.type == ValidatedURLType.DATA_SOURCE for flag in flags]) + assert all([flag.type == URLValidatedType.DATA_SOURCE for flag in flags]) assert set(flag.url_id for flag in flags) == set(url_ids) diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py index 92add28c..e788fff1 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py @@ -1,7 +1,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import 
TestURLProbeCheckManager @@ -33,7 +33,7 @@ async def test_url_probe_task_error( ) assert not await operator.meets_task_prerequisites() url_id: int = await setup_manager.setup_url(URLStatus.OK) - await db_data_creator.create_validated_flags([url_id], validation_type=ValidatedURLType.DATA_SOURCE) + await db_data_creator.create_validated_flags([url_id], validation_type=URLValidatedType.DATA_SOURCE) await db_data_creator.create_url_data_sources([url_id]) assert await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py index 575ca522..7fc54da4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py @@ -1,7 +1,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager @@ -33,7 +33,7 @@ async def test_url_probe_task_not_found( ) assert not await operator.meets_task_prerequisites() url_id = await setup_manager.setup_url(URLStatus.OK) - await db_data_creator.create_validated_flags([url_id], validation_type=ValidatedURLType.NOT_RELEVANT) + await db_data_creator.create_validated_flags([url_id], validation_type=URLValidatedType.NOT_RELEVANT) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py 
b/tests/helpers/data_creator/commands/impl/urls_/convert.py index 32ec321a..d76edfe5 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/convert.py +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -1,5 +1,5 @@ from src.collectors.enums import URLStatus -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -24,13 +24,13 @@ def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) def convert_url_creation_enum_to_validated_type( url_creation_enum: URLCreationEnum -) -> ValidatedURLType: +) -> URLValidatedType: match url_creation_enum: case URLCreationEnum.SUBMITTED: - return ValidatedURLType.DATA_SOURCE + return URLValidatedType.DATA_SOURCE case URLCreationEnum.VALIDATED: - return ValidatedURLType.DATA_SOURCE + return URLValidatedType.DATA_SOURCE case URLCreationEnum.NOT_RELEVANT: - return ValidatedURLType.NOT_RELEVANT + return URLValidatedType.NOT_RELEVANT case _: raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}") \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 389b6f66..93328162 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -7,7 +7,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.client.sync import DatabaseClient @@ -377,7 +377,7 @@ async def url_metadata( async def create_validated_urls( self, 
record_type: RecordType = RecordType.RESOURCES, - validation_type: ValidatedURLType = ValidatedURLType.DATA_SOURCE, + validation_type: URLValidatedType = URLValidatedType.DATA_SOURCE, count: int = 1 ) -> list[int]: url_ids: list[int] = await self.create_urls( @@ -401,7 +401,7 @@ async def create_submitted_urls( ) await self.create_validated_flags( url_ids=url_ids, - validation_type=ValidatedURLType.DATA_SOURCE + validation_type=URLValidatedType.DATA_SOURCE ) await self.create_url_data_sources(url_ids=url_ids) return url_ids @@ -457,7 +457,7 @@ async def create_batch_url_links( async def create_validated_flags( self, url_ids: list[int], - validation_type: ValidatedURLType, + validation_type: URLValidatedType, ): return await create_validated_flags( adb_client=self.adb_client, diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index af927b98..f2bf2c97 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -4,7 +4,7 @@ from src.core.enums import BatchStatus, RecordType from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.batch.pydantic.insert import BatchInsertModel -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic from src.db.models.impl.url.core.enums import URLSource @@ -41,7 +41,7 @@ async def create_urls( async def create_validated_flags( adb_client: AsyncDatabaseClient, url_ids: list[int], - validation_type: ValidatedURLType, + validation_type: URLValidatedType, ) -> None: validated_flags: list[FlagURLValidatedPydantic] = generate_validated_flags( url_ids=url_ids, diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index 5caf4d2c..efea01cc 100644 --- 
a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -3,7 +3,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus, RecordType from src.db.models.impl.batch.pydantic.insert import BatchInsertModel -from src.db.models.impl.flag.url_validated.enums import ValidatedURLType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic @@ -58,7 +58,7 @@ def generate_urls( def generate_validated_flags( url_ids: list[int], - validation_type: ValidatedURLType, + validation_type: URLValidatedType, ) -> list[FlagURLValidatedPydantic]: return [ FlagURLValidatedPydantic( From 4968ab16a3ea30f89e00b92504299afe760fc28f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 29 Aug 2025 06:49:47 -0400 Subject: [PATCH 10/33] Add draft of Meta URL sync logic --- .../scheduled/impl/sync/agency/operator.py | 10 +- .../sync/agency/queries/meta_urls/convert.py | 15 --- .../sync/agency/queries/meta_urls/core.py | 57 ------------ .../sync/agency/queries/meta_urls/extract.py | 12 --- .../sync/agency/queries/meta_urls/filter.py | 68 -------------- .../agency/queries/meta_urls/lookup/core.py | 16 ---- .../queries/meta_urls/lookup/response.py | 14 --- .../queries/meta_urls/lookup/url/core.py | 15 --- .../meta_urls/models/new_url_agencies.py | 8 -- .../agency/queries/meta_urls/models/subset.py | 10 -- .../agency/queries/meta_urls/requester.py | 29 ------ .../queries/meta_urls/update/__init__.py | 0 .../agency/queries/meta_urls/update/filter.py | 17 ---- .../queries/meta_urls/update/requester.py | 13 --- .../impl/sync/agency/queries/upsert/core.py | 18 +++- .../{meta_urls => upsert/links}/__init__.py | 0 .../sync/agency/queries/upsert/links/core.py | 49 ++++++++++ 
.../agency/queries/upsert/links/filter.py | 40 ++++++++ .../add => upsert/links/lookup}/__init__.py | 0 .../queries/upsert/links/lookup/core.py | 54 +++++++++++ .../agency/queries/upsert/links/requester.py | 19 ++++ .../agency/queries/upsert/links/subsets.py | 8 ++ .../lookup => upsert/meta_urls}/__init__.py | 0 .../meta_urls/add}/__init__.py | 0 .../{ => upsert}/meta_urls/add/core.py | 11 +++ .../queries/upsert/meta_urls/convert.py | 27 ++++++ .../agency/queries/upsert/meta_urls/core.py | 55 +++++++++++ .../queries/upsert/meta_urls/extract.py | 12 +++ .../agency/queries/upsert/meta_urls/filter.py | 20 ++++ .../meta_urls/lookup}/__init__.py | 0 .../queries/upsert/meta_urls/lookup/core.py | 46 +++++++++ .../meta_urls/lookup}/response.py | 2 +- .../queries/{ => upsert}/meta_urls/mapper.py | 9 +- .../queries/upsert/meta_urls/requester.py | 40 ++++++++ .../queries/upsert/meta_urls/response.py | 6 ++ .../meta_urls/update}/__init__.py | 0 .../{ => upsert}/meta_urls/update/core.py | 12 +-- .../queries/upsert/meta_urls/update/filter.py | 37 ++++++++ .../{ => upsert}/meta_urls/update/params.py | 0 .../upsert/meta_urls/update/requester.py | 53 +++++++++++ .../queries/upsert/agency/core.py | 93 +++++++++++++++++-- .../queries/upsert/agency/params.py | 2 +- .../queries/upsert/agency/query.py | 79 ---------------- .../queries/upsert/param_manager.py | 6 +- .../data_sources/queries/upsert/requester.py | 6 +- src/db/helpers/session/session_helper.py | 2 +- 46 files changed, 597 insertions(+), 393 deletions(-) delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py delete mode 100644 
src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/__init__.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls => upsert/links}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls/add => upsert/links/lookup}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/subsets.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls/lookup => upsert/meta_urls}/__init__.py (100%) rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls/lookup/link_agency_url => upsert/meta_urls/add}/__init__.py (100%) rename src/core/tasks/scheduled/impl/sync/agency/queries/{ => upsert}/meta_urls/add/core.py (87%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py create 
mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/extract.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls/lookup/url => upsert/meta_urls/lookup}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls/lookup/url => upsert/meta_urls/lookup}/response.py (92%) rename src/core/tasks/scheduled/impl/sync/agency/queries/{ => upsert}/meta_urls/mapper.py (77%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/response.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{meta_urls/models => upsert/meta_urls/update}/__init__.py (100%) rename src/core/tasks/scheduled/impl/sync/agency/queries/{ => upsert}/meta_urls/update/core.py (79%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py rename src/core/tasks/scheduled/impl/sync/agency/queries/{ => upsert}/meta_urls/update/params.py (100%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py delete mode 100644 src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py index bf692b2d..1962eaa7 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ b/src/core/tasks/scheduled/impl/sync/agency/operator.py @@ -1,3 +1,4 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.core import UpsertAgenciesQueryBuilder from src.core.tasks.scheduled.impl.sync.check import check_max_sync_requests_not_exceeded from 
src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase @@ -48,9 +49,8 @@ async def inner_task_logic(self): await self.adb_client.mark_full_agencies_sync() print(f"Sync complete. Synced {count_agencies_synced} agencies") - async def add_new_data(self, agencies: list[AgenciesSyncResponseInnerInfo]): + async def update_data(self, agencies: list[AgenciesSyncResponseInnerInfo]): # First, add new agencies - await self.adb_client.upsert_agencies(agencies) - - # Then, add new meta urls - raise NotImplementedError + await self.adb_client.run_query_builder( + UpsertAgenciesQueryBuilder(agencies) + ) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py deleted file mode 100644 index 309b537e..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/convert.py +++ /dev/null @@ -1,15 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.params import UpdateMetaURLsParams -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -def extract_agency_ids_from_agencies_sync_response( - responses: list[AgenciesSyncResponseInnerInfo] -) -> list[int]: - agency_ids: list[int] = [] - for response in responses: - agency_ids.append(response.id) - return agency_ids - - -def convert_to_update_meta_urls_params(agencies: list[AgenciesSyncResponseInnerInfo]) -> list[UpdateMetaURLsParams]: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py deleted file mode 100644 index 02943e94..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/core.py +++ /dev/null @@ -1,57 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from 
src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.convert import \ - extract_agency_ids_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.url.response import MetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.requester import UpdateMetaURLsRequester -from src.db.dtos.url.mapping import URLMapping -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -class UpdateMetaUrlsQueryBuilder(QueryBuilderBase): - """Updates meta URLs for agencies.""" - - def __init__(self, agencies: list[AgenciesSyncResponseInnerInfo]): - super().__init__() - self.responses = agencies - - async def run(self, session: AsyncSession) -> None: - - requester = UpdateMetaURLsRequester(session) - - # Get URLs to Add - lookup_responses: list[MetaURLLookupResponse] = await requester.lookup_meta_urls(self.responses) - - urls_to_add: list[str] = filter_urls_to_add(lookup_responses) - - # Add new URLs to database - new_url_mappings: list[URLMapping] = await requester.add_meta_urls(urls_to_add) - existing_url_mappings: list[URLMapping] = filter_existing_url_mappings(lookup_responses) - - all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings - - - - - - # Update existing URLs - - - - - # Update existing URLs as validated meta URLs - - # Update Agency-URL links - - # Get existing meta URLs - lookup_responses: list[AgencyMetaURLLookupResponse] = \ - await requester.lookup_meta_urls(self.responses) - - # Compare with new meta URLs, separate into add, remove, and do nothing - - # Add new meta URLs - - # Remove old meta URLs - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py deleted 
file mode 100644 index a9daf46f..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/extract.py +++ /dev/null @@ -1,12 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse -from src.db.dtos.url.mapping import URLMapping - - -def extract_url_mappings_from_agency_meta_url_lookup_response( - lookup_responses: list[AgencyMetaURLLookupResponse] -) -> list[URLMapping]: - url_mappings: set[URLMapping] = set() - for lookup_response in lookup_responses: - for url_mapping in lookup_response.url_mappings: - url_mappings.add(url_mapping) - return list(url_mappings) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py deleted file mode 100644 index c159b47c..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/filter.py +++ /dev/null @@ -1,68 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.extract import \ - extract_url_mappings_from_agency_meta_url_lookup_response -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.mapper import AgencyIDMetaURLMapper -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.new_url_agencies import NewURLAgenciesMapping -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.subset import UpdateMetaAgenciesSubset -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -from src.util.url_mapper import URLMapper - - -def filter_add_and_remove_meta_urls( - lookup_responses: list[AgencyMetaURLLookupResponse], - sync_responses: list[AgenciesSyncResponseInnerInfo] -) -> 
UpdateMetaAgenciesSubset: - - url_mappings: list[URLMapping] = extract_url_mappings_from_agency_meta_url_lookup_response( - lookup_responses - ) - url_mapper = URLMapper(list(url_mappings)) - - agency_meta_url_mapper = AgencyIDMetaURLMapper( - sync_responses - ) - - urls_to_add: list[NewURLAgenciesMapping] = [] - links_to_add: list[LinkURLAgencyPydantic] = [] - links_to_remove: list[LinkURLAgencyPydantic] = [] - - for lookup_response in lookup_responses: - if not lookup_response.exists_in_db: - # All meta_urls in sync must be added - urls_in_sync: list[str] = agency_meta_url_mapper.get_urls( - lookup_response.agency_id - ) - - for url in urls_in_sync: - urls_to_add.append( - NewURLAgenciesMapping( - agency_id=lookup_response.agency_id, - url=url - ) - ) - - # If it already exists in the database, compare the meta_urls and see if they differ - urls_in_db: list[str] = lookup_response.meta_urls - - urls_in_sync: list[str] = agency_meta_url_mapper.get_urls( - lookup_response.agency_id - ) - - in_db_not_sync: list[str] = list(set(urls_in_db) - set(urls_in_sync)) - in_sync_not_db: list[str] = list(set(urls_in_sync) - set(urls_in_db)) - - # For meta_urls in sync but not db, add to urls_to_add - - # For meta_urls in db but not sync, add to links_to_remove - - if lookup_response.exists_in_db: - lookup_response.url_mappings = url_mapper.get_url_mappings( - lookup_response.agency_id - ) - else: - lookup_response.url_mappings = url_mapper.get_url_mappings( - lookup_response.agency_id - ) - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py deleted file mode 100644 index 111629fa..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/core.py +++ /dev/null @@ -1,16 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse -from 
src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse -from src.db.queries.base.builder import QueryBuilderBase - - -class LookupAgencyMetaURLsQueryBuilder(QueryBuilderBase): - """Look up agencies in database, noting those that exist and providing associated meta urls.""" - - def __init__(self, agency_ids: list[int]): - super().__init__() - self.agency_ids = agency_ids - - async def run(self, session: AsyncSession) -> list[AgencyMetaURLLookupResponse]: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py deleted file mode 100644 index 51eb9b2c..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/response.py +++ /dev/null @@ -1,14 +0,0 @@ -from pydantic import BaseModel - -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import URLValidatedType - - -class AgencyMetaURLLookupResponse(BaseModel): - agency_id: int - exists_in_db: bool - url_mappings: list[URLMapping] = [] - - @property - def meta_urls(self) -> list[str]: - return [url_mapping.url for url_mapping in self.url_mappings] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py deleted file mode 100644 index 7771a6c9..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/url/core.py +++ /dev/null @@ -1,15 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.url.response import MetaURLLookupResponse -from src.db.queries.base.builder import QueryBuilderBase - - -class LookupMetaURLsQueryBuilder(QueryBuilderBase): - """Lookup whether URLs exist in DB and are validated as meta URLs""" - 
- def __init__(self, urls: list[str]): - super().__init__() - self.urls = urls - - async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py deleted file mode 100644 index 5016b0a7..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/new_url_agencies.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import BaseModel - - -class NewURLAgenciesMapping(BaseModel): - """Denote URLs that need to be added to the database, - along with the agencies that should be associated with them.""" - url: str - agency_ids: list[int] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py deleted file mode 100644 index ced11c6e..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/models/subset.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import BaseModel - -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.models.new_url_agencies import NewURLAgenciesMapping -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic - - -class UpdateMetaAgenciesSubset(BaseModel): - urls_to_add: list[NewURLAgenciesMapping] - links_to_add: list[LinkURLAgencyPydantic] - links_to_remove: list[LinkURLAgencyPydantic] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py deleted file mode 100644 index 46698832..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/requester.py +++ /dev/null @@ -1,29 +0,0 @@ -from 
src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.convert import \ - extract_agency_ids_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.core import LookupAgencyMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.lookup.response import AgencyMetaURLLookupResponse -from src.db.templates.requester import RequesterBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -class UpdateMetaURLsRequester(RequesterBase): - - async def lookup_meta_urls(self, agencies: list[AgenciesSyncResponseInnerInfo]) -> list[AgencyMetaURLLookupResponse]: - agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(agencies) - return await self.run_query_builder( - LookupAgencyMetaURLsQueryBuilder( - agency_ids - ) - ) - - async def add_meta_urls(self) -> None: - raise NotImplementedError - - async def update_meta_urls(self) -> None: - raise NotImplementedError - - async def add_agency_url_links(self) -> None: - raise NotImplementedError - - async def remove_agency_url_links(self) -> None: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py deleted file mode 100644 index 41a0f5ee..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/filter.py +++ /dev/null @@ -1,17 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.meta_urls.update.params import UpdateMetaURLsParams - - -def filter_urls_with_non_meta_record_type( - params: list[UpdateMetaURLsParams] -) -> list[int]: - raise NotImplementedError - -def 
filter_urls_without_validation_flag( - params: list[UpdateMetaURLsParams] -) -> list[int]: - raise NotImplementedError - -def filter_urls_with_non_meta_url_validation_flag( - params: list[UpdateMetaURLsParams] -) -> list[int]: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py deleted file mode 100644 index 80233975..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/update/requester.py +++ /dev/null @@ -1,13 +0,0 @@ -from src.db.templates.requester import RequesterBase - - -class UpdateMetaURLsUpdateURLAndValidationFlagsRequester(RequesterBase): - - async def update_validation_flags(self, url_ids: list[int]) -> None: - raise NotImplementedError - - async def add_validation_flags(self, url_ids: list[int]) -> None: - raise NotImplementedError - - async def update_urls(self, url_ids: list[int]) -> None: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py index 0802eb56..dc7ba155 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py @@ -1,5 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.core import UpdateAgencyURLLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.core import UpsertMetaUrlsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ convert_agencies_sync_response_to_agencies_upsert from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel @@ -10,10 
+13,19 @@ class UpsertAgenciesQueryBuilder(QueryBuilderBase): - def __init__(self, agencies: list[AgenciesSyncResponseInnerInfo]): + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): super().__init__() - self.agencies = agencies + self.sync_responses = sync_responses async def run(self, session: AsyncSession) -> None: - agency_upserts: list[AgencyUpsertModel] = convert_agencies_sync_response_to_agencies_upsert(self.agencies) + # Upsert Agencies + agency_upserts: list[AgencyUpsertModel] = convert_agencies_sync_response_to_agencies_upsert(self.sync_responses) await sh.bulk_upsert(session=session, models=agency_upserts) + + # Add and update Meta URLs + meta_urls_query_builder = UpsertMetaUrlsQueryBuilder(self.sync_responses) + upsert_meta_urls_responses: list[AgencyURLMappings] = await meta_urls_query_builder.run(session=session) + + # Add and remove URL-Agency Links + update_url_links_query_builder = UpdateAgencyURLLinksQueryBuilder(upsert_meta_urls_responses) + await update_url_links_query_builder.run(session=session) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py new file mode 100644 index 00000000..f8447da4 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py @@ -0,0 +1,49 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.filter import filter_agency_meta_url_link_subsets +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.requester import 
UpdateAgencyURLLinksRequester +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.subsets import AgencyMetaURLLinkSubsets +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateAgencyURLLinksQueryBuilder(QueryBuilderBase): + """Updates agency URL links.""" + + def __init__( + self, + responses: list[AgencyURLMappings] + ): + super().__init__() + self._new_mappings = responses + + async def run(self, session: AsyncSession) -> None: + + requester = UpdateAgencyURLLinksRequester(session) + agency_ids: list[int] = [response.agency_id for response in self._new_mappings] + old_mappings: list[AgencyURLMappings] = await requester.lookup_meta_url_agency_links(agency_ids) + + subset_list: list[AgencyMetaURLLinkSubsets] = filter_agency_meta_url_link_subsets( + new_mappings=self._new_mappings, + old_mappings=old_mappings, + ) + + links_to_add: list[LinkURLAgencyPydantic] = [] + links_to_remove: list[LinkURLAgencyPydantic] = [] + for subsets in subset_list: + agency_id: int = subsets.agency_id + for url_id in subsets.add: + links_to_add.append( + LinkURLAgencyPydantic(url_id=url_id, agency_id=agency_id) + ) + for url_id in subsets.remove: + links_to_remove.append( + LinkURLAgencyPydantic(url_id=url_id, agency_id=agency_id) + ) + + await requester.add_agency_url_links(links=links_to_add) + await requester.remove_agency_url_links(links=links_to_remove) + + + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py new file mode 100644 index 00000000..c4b23b48 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py @@ -0,0 +1,40 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.subsets import 
AgencyMetaURLLinkSubsets +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings + +def _convert_to_agency_id_to_url_ids(mappings: list[AgencyURLMappings]) -> dict[int, list[int]]: + agency_id_to_url_ids: dict[int, list[int]] = {} + for mapping in mappings: + agency_id_to_url_ids[mapping.agency_id] = mapping.url_ids + return agency_id_to_url_ids + + +def filter_agency_meta_url_link_subsets( + new_mappings: list[AgencyURLMappings], + old_mappings: list[AgencyURLMappings], +) -> list[AgencyMetaURLLinkSubsets]: + + agency_id_to_new_url_ids: dict[int, list[int]] = _convert_to_agency_id_to_url_ids(new_mappings) + agency_id_to_old_url_ids: dict[int, list[int]] = _convert_to_agency_id_to_url_ids(old_mappings) + + subset_list: list[AgencyMetaURLLinkSubsets] = [] + + for agency_id in agency_id_to_new_url_ids.keys(): + + new_url_ids: set[int] = set(agency_id_to_new_url_ids[agency_id]) + old_url_ids: set[int] = set(agency_id_to_old_url_ids.get(agency_id, [])) + + url_ids_to_add: list[int] = list(new_url_ids - old_url_ids) + url_ids_to_remove: list[int] = list(old_url_ids - new_url_ids) + url_ids_to_do_nothing_with: list[int] = list(old_url_ids & new_url_ids) + + subsets = AgencyMetaURLLinkSubsets( + agency_id=agency_id, + add=url_ids_to_add, + remove=url_ids_to_remove, + do_nothing=url_ids_to_do_nothing_with, + ) + subset_list.append(subsets) + + return subset_list + + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py new file mode 100644 
index 00000000..6fe570d6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py @@ -0,0 +1,54 @@ +from collections import defaultdict +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class LookupMetaURLAgencyLinksQueryBuilder(QueryBuilderBase): + """Given a set of Agency IDs, return all Meta URL agency links.""" + + def __init__(self, agency_ids: list[int]): + super().__init__() + self._agency_ids = agency_ids + + async def run(self, session: AsyncSession) -> list[AgencyURLMappings]: + query = ( + select( + LinkURLAgency.url_id, + LinkURLAgency.agency_id, + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkURLAgency.url_id, + ) + .where( + LinkURLAgency.agency_id.in_(self._agency_ids), + FlagURLValidated.type == URLValidatedType.META_URL + ) + ) + db_mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + agency_id_to_url_ids: dict[int, list[int]] = defaultdict(list) + for mapping in db_mappings: + agency_id: int = mapping["agency_id"] + url_id: int = mapping["url_id"] + agency_id_to_url_ids[agency_id].append(url_id) + + result_mappings: list[AgencyURLMappings] = [] + for agency_id in agency_id_to_url_ids.keys(): + url_ids: list[int] = agency_id_to_url_ids[agency_id] + result_mapping = AgencyURLMappings( + agency_id=agency_id, + url_ids=url_ids, + ) + result_mappings.append(result_mapping) + + return result_mappings \ No newline at end of file diff --git 
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup.core import LookupMetaURLAgencyLinksQueryBuilder
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings
from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic
from src.db.templates.requester import RequesterBase

from src.db.helpers.session import session_helper as sh


class UpdateAgencyURLLinksRequester(RequesterBase):
    """DB access layer for syncing agency <-> meta-URL link rows."""

    async def lookup_meta_url_agency_links(self, agency_ids: list[int]) -> list[AgencyURLMappings]:
        """Fetch the existing meta-URL links for the given agencies."""
        builder = LookupMetaURLAgencyLinksQueryBuilder(agency_ids=agency_ids)
        return await builder.run(session=self.session)

    async def add_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None:
        """Bulk-insert new URL-agency link rows."""
        await sh.bulk_insert(self.session, models=links)

    async def remove_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None:
        """Bulk-delete obsolete URL-agency link rows."""
        await sh.bulk_delete(self.session, models=links)
src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/link_agency_url/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/lookup/link_agency_url/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py similarity index 87% rename from src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py index 94ed7481..73761251 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/add/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py @@ -34,6 +34,15 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: ) url_ids: list[int] = await sh.bulk_insert(session, models=url_inserts, return_ids=True) + # Connect with URLs + mappings: list[URLMapping] = [ + URLMapping( + url=url, + url_id=url_id, + ) + for url, url_id in zip(self.urls, url_ids) + ] + # Add Validation Flags flag_inserts: list[FlagURLValidatedPydantic] = [] for url_id in url_ids: @@ -44,3 +53,5 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: ) ) await sh.bulk_insert(session, models=flag_inserts) + + return mappings diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py new file mode 100644 index 00000000..8d3e8785 --- /dev/null +++ 
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams
from src.db.dtos.url.mapping import URLMapping


def convert_to_update_meta_urls_params(
    lookups: list[MetaURLLookupResponse],
) -> list[UpdateMetaURLsParams]:
    """Project lookup responses onto the params consumed by the update query."""
    params: list[UpdateMetaURLsParams] = []
    for lookup in lookups:
        params.append(
            UpdateMetaURLsParams(
                url_id=lookup.url_id,
                validation_type=lookup.validation_type,
                record_type=lookup.record_type,
            )
        )
    return params


def convert_url_lookups_to_url_mappings(
    lookups: list[MetaURLLookupResponse],
) -> list[URLMapping]:
    """Reduce lookup responses to bare URL <-> id mappings."""
    mappings: list[URLMapping] = []
    for lookup in lookups:
        mappings.append(URLMapping(url_id=lookup.url_id, url=lookup.url))
    return mappings
class UpsertMetaUrlsQueryBuilder(QueryBuilderBase):
    """Add and update meta URLs for agencies."""

    def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]):
        super().__init__()
        self.sync_responses = sync_responses

    async def run(self, session: AsyncSession) -> list[AgencyURLMappings]:
        requester = UpdateMetaURLsRequester(session)

        # Classify every incoming URL against the DB, insert the unknown
        # ones, then realign record type / validation flag on the rest.
        lookups: list[MetaURLLookupResponse] = \
            await requester.lookup_meta_urls(self.sync_responses)
        added: list[URLMapping] = \
            await requester.add_new_urls_to_database(lookups)
        updated: list[URLMapping] = \
            await requester.update_existing_urls(lookups)

        return self._build_responses(updated + added)

    def _build_responses(self, all_url_mappings: list[URLMapping]) -> list[AgencyURLMappings]:
        """Resolve each agency's meta URLs to URL ids via the two mappers."""
        agency_id_mapper = AgencyIDMetaURLMapper(self.sync_responses)
        url_mapper = URLMapper(all_url_mappings)

        return [
            AgencyURLMappings(
                agency_id=agency_id,
                url_ids=[
                    url_mapper.get_id(url)
                    for url in agency_id_mapper.get_urls(agency_id)
                ],
            )
            for agency_id in agency_id_mapper.get_all_ids()
        ]
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse


def filter_urls_to_add(
    lookup_responses: list[MetaURLLookupResponse],
) -> list[str]:
    """URLs from the sync payload that are not yet present in the database."""
    urls: list[str] = []
    for response in lookup_responses:
        if not response.exists_in_db:
            urls.append(response.url)
    return urls


def filter_existing_url_mappings(
    lookup_responses: list[MetaURLLookupResponse],
) -> list[MetaURLLookupResponse]:
    """Lookup responses whose URL already exists in the database."""
    existing: list[MetaURLLookupResponse] = []
    for response in lookup_responses:
        if response.exists_in_db:
            existing.append(response)
    return existing
class LookupMetaURLsQueryBuilder(QueryBuilderBase):
    """Lookup whether URLs exist in DB and are validated as meta URLs"""

    def __init__(self, urls: list[str]):
        super().__init__()
        self.urls = urls

    async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]:
        # Left-join the validation flag so unvalidated URLs still come back
        # (their `type` is NULL). URLs absent from the table yield no row.
        query = (
            select(URL.id, URL.url, URL.record_type, FlagURLValidated.type)
            .join(
                FlagURLValidated,
                FlagURLValidated.url_id == URL.id,
                isouter=True,
            )
            .where(URL.url.in_(self.urls))
        )
        rows: Sequence[RowMapping] = await sh.mappings(session, query=query)

        responses: list[MetaURLLookupResponse] = []
        for row in rows:
            responses.append(
                MetaURLLookupResponse(
                    url=row["url"],
                    url_id=row["id"],
                    record_type=row["record_type"],
                    validation_type=row["type"],
                )
            )
        return responses
src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/mapper.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/mapper.py index b46608d4..d5962770 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/meta_urls/mapper.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/mapper.py @@ -13,9 +13,8 @@ def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): self._meta_url_to_agency_id[meta_url].append(sync_response.agency_id) self._agency_id_to_meta_urls[sync_response.agency_id].append(meta_url) + def get_urls(self, id_: int) -> list[str]: + return self._agency_id_to_meta_urls[id_] - def get_ids(self, url: str) -> list[int]: - return self._meta_url_to_agency_id[url] - - def get_urls(self, id: int) -> list[str]: - return self._agency_id_to_meta_urls[id] \ No newline at end of file + def get_all_ids(self) -> list[int]: + return list(self._agency_id_to_meta_urls.keys()) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py new file mode 100644 index 00000000..509b0d57 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py @@ -0,0 +1,40 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.add.core import AddMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.convert import \ + convert_to_update_meta_urls_params, convert_url_lookups_to_url_mappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_existing_url_mappings, \ + filter_urls_to_add +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.core import LookupMetaURLsQueryBuilder +from 
class UpdateMetaURLsRequester(RequesterBase):
    """Orchestrates lookup, insert, and update of meta URLs during agency sync."""

    async def lookup_meta_urls(
        self,
        agencies: list[AgenciesSyncResponseInnerInfo]
    ) -> list[MetaURLLookupResponse]:
        """Classify every distinct meta URL in the sync payload against the DB."""
        urls: list[str] = extract_urls_from_agencies_sync_response(agencies)
        return await LookupMetaURLsQueryBuilder(urls).run(self.session)

    async def add_new_urls_to_database(self, lookup_responses: list[MetaURLLookupResponse]) -> list[URLMapping]:
        """Insert URLs the lookup did not find; return their new id mappings."""
        new_urls: list[str] = filter_urls_to_add(lookup_responses)
        return await AddMetaURLsQueryBuilder(new_urls).run(self.session)

    async def update_existing_urls(
        self,
        lookup_responses: list[MetaURLLookupResponse]
    ) -> list[URLMapping]:
        """Re-validate/re-type already-known URLs; return their id mappings."""
        existing: list[MetaURLLookupResponse] = filter_existing_url_mappings(lookup_responses)
        update_params: list[UpdateMetaURLsParams] = convert_to_update_meta_urls_params(existing)
        await UpdateMetaURLsQueryBuilder(update_params).run(self.session)
        return convert_url_lookups_to_url_mappings(existing)
class AgencyURLMappings(BaseModel):
    """URL ids associated with a single agency."""

    # ID of the agency the URLs belong to.
    agency_id: int
    # IDs of the URLs linked to that agency.
    url_ids: list[int]
from src.core.enums import RecordType
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams
from src.db.models.impl.flag.url_validated.enums import URLValidatedType


def filter_urls_with_non_meta_record_type(
    params: list[UpdateMetaURLsParams]
) -> list[int]:
    """URL ids whose record type is missing or not the agency-meta type.

    BUGFIX: the original appended a url_id twice when record_type was None,
    because both the `is None` branch and the `!=` branch matched.  A None
    record type already fails the equality test, so one check suffices.
    """
    return [
        param.url_id
        for param in params
        if param.record_type != RecordType.CONTACT_INFO_AND_AGENCY_META
    ]


def filter_urls_without_validation_flag(
    params: list[UpdateMetaURLsParams]
) -> list[int]:
    """URL ids that carry no validation flag at all."""
    return [param.url_id for param in params if param.validation_type is None]


def filter_urls_with_non_meta_url_validation_flag(
    params: list[UpdateMetaURLsParams]
) -> list[int]:
    """URL ids whose existing validation flag is something other than META_URL.

    BUGFIX: the original guarded on `param.validation_flag`, an attribute that
    UpdateMetaURLsParams does not have (construction sites populate only
    url_id / validation_type / record_type), raising AttributeError at
    runtime.  The intended guard is on `validation_type`.
    """
    url_ids: list[int] = []
    for param in params:
        if param.validation_type is None:
            # No flag at all: handled by filter_urls_without_validation_flag.
            continue
        if param.validation_type != URLValidatedType.META_URL:
            url_ids.append(param.url_id)
    return url_ids
from sqlalchemy import update

from src.core.enums import RecordType
from src.db.models.impl.flag.url_validated.enums import URLValidatedType
from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.templates.requester import RequesterBase

from src.db.helpers.session import session_helper as sh


class UpdateMetaURLsUpdateURLAndValidationFlagsRequester(RequesterBase):
    """Writes that force existing URLs into the meta-URL shape."""

    async def update_validation_flags(self, url_ids: list[int]) -> None:
        """Set validation flag for URLs to Meta URL"""
        stmt = (
            update(FlagURLValidated)
            .values(type=URLValidatedType.META_URL)
            .where(FlagURLValidated.url_id.in_(url_ids))
        )
        await self.session.execute(stmt)

    async def add_validation_flags(self, url_ids: list[int]) -> None:
        """Insert a META_URL validation flag for each given URL id."""
        flags: list[FlagURLValidatedPydantic] = [
            FlagURLValidatedPydantic(url_id=url_id, type=URLValidatedType.META_URL)
            for url_id in url_ids
        ]
        await sh.bulk_insert(self.session, models=flags)

    async def update_urls(self, url_ids: list[int]) -> None:
        """Update URLs and set record type to Contact Info and Agency Meta"""
        stmt = (
            update(URL)
            .values(record_type=RecordType.CONTACT_INFO_AND_AGENCY_META)
            .where(URL.id.in_(url_ids))
        )
        await self.session.execute(stmt)
from collections import defaultdict

from sqlalchemy import or_, select
from sqlalchemy.ext.asyncio import AsyncSession

from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models
from src.db.helpers.session import session_helper as sh
from src.db.models.impl.flag.url_validated.enums import URLValidatedType
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic
from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyParams
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.queries.base.builder import QueryBuilderBase


class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase):
    """Given a set of URL-Agency links, remove all non-matching non-Meta URL links and add new ones."""

    def __init__(self, models: list[UpdateLinkURLAgencyParams]):
        super().__init__()
        self.models = models
        # Desired state: url_id -> agency ids the sync says should exist.
        self._new_links: dict[int, list[int]] = {
            model.url_id: model.new_agency_ids
            for model in self.models
        }
        # Current state, filled lazily by _get_existing_links.
        self._existing_links: dict[int, list[int]] = defaultdict(list)
        self.existing_url_ids: set[int] = {
            model.url_id for model in self.models
        }

    async def _get_existing_links(self, session: AsyncSession) -> None:
        """Get existing non-meta URL agency links for provided URL IDs.

        Modifies:
            self._existing_links

        BUGFIX: the original filtered with a bare
        ``FlagURLValidated.type != META_URL``; under SQL three-valued logic
        that predicate also discards rows whose outer-joined flag is NULL,
        so links for URLs with no validation flag were invisible — they were
        never deleted, and could be re-inserted as duplicates.  Explicitly
        admitting the NULL case restores the old query.py behavior for
        unflagged links while still protecting META_URL links.
        """
        query = (
            select(LinkURLAgency)
            .outerjoin(
                FlagURLValidated,
                FlagURLValidated.url_id == LinkURLAgency.url_id,
            )
            .where(
                LinkURLAgency.url_id.in_(self.existing_url_ids),
                or_(
                    FlagURLValidated.type.is_(None),
                    FlagURLValidated.type != URLValidatedType.META_URL,
                ),
            )
        )
        links = await session.scalars(query)
        for link in links:
            self._existing_links[link.url_id].append(link.agency_id)

    async def _update_links(self, session: AsyncSession) -> None:
        """Delete stale links and insert missing ones, one URL at a time."""
        links_to_delete: list[LinkURLAgencyPydantic] = []
        links_to_insert: list[LinkURLAgencyPydantic] = []

        for url_id in self.existing_url_ids:
            new_ids = set(self._new_links.get(url_id, []))
            existing_ids = set(self._existing_links.get(url_id, []))
            # Delete existing agency ids that the sync no longer lists.
            links_to_delete.extend(
                convert_to_link_url_agency_models(
                    url_id=url_id,
                    agency_ids=list(existing_ids - new_ids),
                )
            )
            # Insert new agency ids not already linked.
            links_to_insert.extend(
                convert_to_link_url_agency_models(
                    url_id=url_id,
                    agency_ids=list(new_ids - existing_ids),
                )
            )

        await sh.bulk_delete(session=session, models=links_to_delete)
        await sh.bulk_insert(session=session, models=links_to_insert)

    async def run(self, session: AsyncSession) -> None:
        await self._get_existing_links(session=session)
        await self._update_links(session=session)
query.run(session) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py index d43bbbd8..6f8a14eb 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py @@ -1,7 +1,7 @@ from pydantic import BaseModel -class UpdateLinkURLAgencyForDataSourcesSyncParams(BaseModel): +class UpdateLinkURLAgencyParams(BaseModel): url_id: int new_agency_ids: list[int] old_agency_ids: list[int] diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py deleted file mode 100644 index a81be905..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py +++ /dev/null @@ -1,79 +0,0 @@ -from collections import defaultdict - -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.queries.base.builder import QueryBuilderBase - - -class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase): - """Given a set of URL-Agency links, remove all non-matching links and add new ones.""" - - - def __init__(self, models: list[UpdateLinkURLAgencyForDataSourcesSyncParams]): - super().__init__() - self.models = models - self._new_links: dict[int, list[int]] = { - model.url_id: model.new_agency_ids - for 
model in self.models - } - self._existing_links: dict[int, list[int]] = defaultdict(list) - self.existing_url_ids = {model.url_id for model in self.models} - - async def _get_existing_links(self, session: AsyncSession): - """Get existing agency links for provided URLs. - - Modifies: - self._existing_links - """ - query = ( - select(LinkURLAgency) - .where( - LinkURLAgency.url_id.in_( - self.existing_url_ids - ) - ) - ) - links = await session.scalars(query) - for link in links: - self._existing_links[link.url_id].append(link.agency_id) - - async def _update_links(self, session: AsyncSession): - # Remove all existing links not in new links - links_to_delete: list[LinkURLAgencyPydantic] = [] - links_to_insert: list[LinkURLAgencyPydantic] = [] - - for url_id in self.existing_url_ids: - new_agency_ids = self._new_links.get(url_id, []) - existing_agency_ids = self._existing_links.get(url_id, []) - # IDs to delete are existing agency ids that are not new agency ids - ids_to_delete = set(existing_agency_ids) - set(new_agency_ids) - # IDs to insert are new agency ids that are not existing agency ids - ids_to_insert = set(new_agency_ids) - set(existing_agency_ids) - - links_to_delete.extend( - convert_to_link_url_agency_models( - url_id=url_id, - agency_ids=list(ids_to_delete) - ) - ) - links_to_insert.extend( - convert_to_link_url_agency_models( - url_id=url_id, - agency_ids=list(ids_to_insert) - ) - ) - - await sh.bulk_delete(session=session, models=links_to_delete) - await sh.bulk_insert(session=session, models=links_to_insert) - - async def run(self, session: AsyncSession): - await self._get_existing_links(session=session) - await self._update_links(session=session) - - diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py index 5c57474d..e0a7225f 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ 
b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py @@ -1,5 +1,5 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyForDataSourcesSyncParams + UpdateLinkURLAgencyParams from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import \ convert_approval_status_to_validated_type from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ @@ -61,12 +61,12 @@ def add_new_urls( def update_agency_link( self, lookup_results: list[LookupURLForDataSourcesSyncResponse] - ) -> list[UpdateLinkURLAgencyForDataSourcesSyncParams]: + ) -> list[UpdateLinkURLAgencyParams]: results = [] for lookup_result in lookup_results: url_info = lookup_result.url_info sync_info = self._mapper.get(url_info.url) - update_params = UpdateLinkURLAgencyForDataSourcesSyncParams( + update_params = UpdateLinkURLAgencyParams( url_id=url_info.url_id, new_agency_ids=sync_info.agency_ids, old_agency_ids=url_info.agency_ids diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py index e91cd229..eaae3a17 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py @@ -1,8 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.query import \ + UpdateLinkURLAgencyParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.core import \ URLAgencyLinkUpdateQueryBuilder from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ 
InsertURLForDataSourcesSyncParams @@ -72,7 +72,7 @@ async def add_new_agency_links( async def update_agency_links( self, - params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] + params: list[UpdateLinkURLAgencyParams] ) -> None: """Overwrite existing url_agency links with new ones, if applicable.""" query = URLAgencyLinkUpdateQueryBuilder(params) diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 290ae2bd..508ed16b 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -13,6 +13,7 @@ from src.db.helpers.session.parser import BulkActionParser from src.db.models.templates_.with_id import WithIDBase from src.db.models.templates_.base import Base +from src.db.queries.base.builder import QueryBuilderBase from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.update import BulkUpdatableModel @@ -222,4 +223,3 @@ async def bulk_update( ) await session.execute(stmt) - From b8749a45ff6bb8971f26c8217edecb76d90c374b Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 30 Aug 2025 08:41:47 -0400 Subject: [PATCH 11/33] Continue draft --- pyproject.toml | 1 + .../scheduled/impl/sync/agency/operator.py | 2 +- .../impl/sync/agency/queries/upsert/core.py | 5 +- .../queries/upsert/{meta_urls => }/extract.py | 0 .../agency/queries/upsert/links/convert.py | 81 ++++++++++++++ .../sync/agency/queries/upsert/links/core.py | 63 +++++++---- .../agency/queries/upsert/links/filter.py | 48 ++------ .../agency/queries/upsert/links/lookup.py | 37 ++++++ .../queries/upsert/links/lookup/core.py | 54 --------- .../links/{lookup => models}/__init__.py | 0 .../response.py => links/models/mappings.py} | 0 .../agency/queries/upsert/links/requester.py | 9 +- .../agency/queries/upsert/links/subsets.py | 8 -- .../upsert/{meta_urls => }/lookup/__init__.py | 0 
.../sync/agency/queries/upsert/lookup/core.py | 105 ++++++++++++++++++ .../agency/queries/upsert/lookup/extract.py | 10 ++ .../upsert/{meta_urls => }/lookup/response.py | 1 + .../queries/upsert/meta_urls/convert.py | 2 +- .../agency/queries/upsert/meta_urls/core.py | 40 ++----- .../agency/queries/upsert/meta_urls/filter.py | 21 +++- .../queries/upsert/meta_urls/lookup/core.py | 46 -------- .../agency/queries/upsert/meta_urls/mapper.py | 20 ---- .../queries/upsert/meta_urls/requester.py | 22 ++-- .../queries/upsert/meta_urls/update/filter.py | 2 +- src/db/client/async_.py | 10 +- src/db/helpers/session/session_helper.py | 3 +- .../models/impl/link/url_agency/pydantic.py | 4 + .../scheduled/impl/sync/agency/conftest.py | 20 +++- .../impl/sync/agency/setup/__init__.py | 0 .../scheduled/impl/sync/agency/setup/core.py | 53 +++++++++ .../sync/agency/test_ds_url_in_db_not_sync.py | 90 +++++++++++++++ .../impl/sync/agency/test_interruption.py | 3 - .../agency/test_meta_url_in_db_not_sync.py | 78 +++++++++++++ ...est_happy_path.py => test_no_meta_urls.py} | 3 + .../agency/test_same_meta_url_diff_agency.py | 77 +++++++++++++ .../test_same_meta_url_diff_val_record.py | 86 ++++++++++++++ .../test_with_meta_url_not_in_database.py | 67 +++++++++++ .../test_meta_url_not_modified.py | 88 +++++++++++++++ .../test_validated_meta_url.py | 10 ++ tests/helpers/data_creator/core.py | 75 +++++++++---- tests/helpers/data_creator/create.py | 6 +- uv.lock | 56 ++++++++++ 42 files changed, 1021 insertions(+), 285 deletions(-) rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{meta_urls => }/extract.py (100%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/{lookup => 
models}/__init__.py (100%) rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{meta_urls/response.py => links/models/mappings.py} (100%) delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/subsets.py rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{meta_urls => }/lookup/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/extract.py rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{meta_urls => }/lookup/response.py (94%) delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py delete mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/mapper.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py rename tests/automated/integration/tasks/scheduled/impl/sync/agency/{test_happy_path.py => test_no_meta_urls.py} (95%) create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py create mode 100644 tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py diff --git a/pyproject.toml b/pyproject.toml index 
3eb1446d..51eca7a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "pyjwt~=2.10.1", "python-dotenv~=1.0.1", "requests~=2.32.3", + "side-effects>=1.6.dev0", "sqlalchemy~=2.0.36", "starlette~=0.45.3", "tqdm>=4.64.1", diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py index 1962eaa7..6adff30b 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ b/src/core/tasks/scheduled/impl/sync/agency/operator.py @@ -31,7 +31,7 @@ async def inner_task_logic(self): count_agencies_synced = 0 request_count = 0 while len(response.agencies) > 0: - await self.adb_client.upsert_agencies(response.agencies) + await self.update_data(response.agencies) count_agencies_synced += len(response.agencies) request_count += 1 diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py index dc7ba155..fc909e48 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py @@ -2,7 +2,6 @@ from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.core import UpdateAgencyURLLinksQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.core import UpsertMetaUrlsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ convert_agencies_sync_response_to_agencies_upsert from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel @@ -24,8 +23,8 @@ async def run(self, session: AsyncSession) -> None: # Add and update Meta URLs meta_urls_query_builder = UpsertMetaUrlsQueryBuilder(self.sync_responses) - upsert_meta_urls_responses: list[AgencyURLMappings] = await meta_urls_query_builder.run(session=session) + await 
meta_urls_query_builder.run(session=session) # Add and remove URL-Agency Links - update_url_links_query_builder = UpdateAgencyURLLinksQueryBuilder(upsert_meta_urls_responses) + update_url_links_query_builder = UpdateAgencyURLLinksQueryBuilder(self.sync_responses) await update_url_links_query_builder.run(session=session) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/extract.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py new file mode 100644 index 00000000..7317b23b --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py @@ -0,0 +1,81 @@ +from collections import defaultdict + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper + + +def _convert_lookup_response_to_url_mapping( + response: MetaURLLookupResponse +) -> URLMapping: + return URLMapping( + url_id=response.url_id, + url=response.url, + ) + +def convert_sync_and_lookup_responses_to_sync_mappings( + sync_responses: list[AgenciesSyncResponseInnerInfo], + lookup_responses: list[MetaURLLookupResponse] +) -> list[AgencyURLMappings]: + """Get all prior Agency-URL mappings. 
+ Leveraging the lookup responses to get the URL ids + """ + + # Get the URL ids for the URLs + lookup_url_mappings: list[URLMapping] = [ + _convert_lookup_response_to_url_mapping(response) + for response in lookup_responses + ] + url_mapper = URLMapper(lookup_url_mappings) + + # Associate Agency with URLs in Sync Responses + agency_to_sync_urls: dict[int, list[str]] = {} + for response in sync_responses: + agency_to_sync_urls[response.agency_id] = response.meta_urls + + # Create Agency-URL Mappings + agency_url_mappings: list[AgencyURLMappings] = [] + for agency in agency_to_sync_urls: + url_ids: list[int] = [] + for url in agency_to_sync_urls[agency]: + url_id: int = url_mapper.get_id(url) + url_ids.append(url_id) + agency_url_mapping = AgencyURLMappings( + agency_id=agency, + url_ids=url_ids, + ) + agency_url_mappings.append(agency_url_mapping) + + return agency_url_mappings + + +def convert_lookup_responses_to_mappings( + responses: list[MetaURLLookupResponse] +) -> list[AgencyURLMappings]: + """Get all current Agency-URL mappings.""" + agency_to_url_ids: dict[int, list[int]] = defaultdict(list) + for response in responses: + for agency_id in response.agency_ids: + agency_to_url_ids[agency_id].append(response.url_id) + + agency_url_mappings: list[AgencyURLMappings] = [] + for agency_id in agency_to_url_ids: + agency_url_mappings.append(AgencyURLMappings( + agency_id=agency_id, + url_ids=agency_to_url_ids[agency_id], + )) + + return agency_url_mappings + +def convert_mappings_to_links( + mappings: list[AgencyURLMappings] +) -> set[LinkURLAgencyPydantic]: + links: set[LinkURLAgencyPydantic] = set() + for mapping in mappings: + for url_id in mapping.url_ids: + links.add(LinkURLAgencyPydantic(url_id=url_id, agency_id=mapping.agency_id)) + + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py index f8447da4..99d590a1 
100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py @@ -1,11 +1,18 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.filter import filter_agency_meta_url_link_subsets +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.convert import \ + convert_lookup_responses_to_mappings, convert_mappings_to_links, convert_sync_and_lookup_responses_to_sync_mappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.filter import filter_non_relevant_mappings from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.requester import UpdateAgencyURLLinksRequester -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.subsets import AgencyMetaURLLinkSubsets -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.core import LookupMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ + extract_agency_ids_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo class UpdateAgencyURLLinksQueryBuilder(QueryBuilderBase): @@ -13,37 +20,47 @@ class UpdateAgencyURLLinksQueryBuilder(QueryBuilderBase): def __init__( self, - responses: list[AgencyURLMappings] + sync_responses: 
list[AgenciesSyncResponseInnerInfo] ): super().__init__() - self._new_mappings = responses + self._sync_responses = sync_responses async def run(self, session: AsyncSession) -> None: + # TODO: Replace with LookupMetaURLLinksQueryBuilder - requester = UpdateAgencyURLLinksRequester(session) - agency_ids: list[int] = [response.agency_id for response in self._new_mappings] - old_mappings: list[AgencyURLMappings] = await requester.lookup_meta_url_agency_links(agency_ids) + lookup_responses: list[MetaURLLookupResponse] = \ + await LookupMetaURLsQueryBuilder(self._sync_responses).run(session=session) + filtered_lookup_responses: list[MetaURLLookupResponse] = \ + filter_urls_in_sync(self._sync_responses, lookup_responses=lookup_responses) - subset_list: list[AgencyMetaURLLinkSubsets] = filter_agency_meta_url_link_subsets( - new_mappings=self._new_mappings, - old_mappings=old_mappings, + new_mappings: list[AgencyURLMappings] = convert_sync_and_lookup_responses_to_sync_mappings( + self._sync_responses, + lookup_responses=filtered_lookup_responses, ) + old_mappings: list[AgencyURLMappings] = self._get_old_mappings(filtered_lookup_responses) + + new_links: set[LinkURLAgencyPydantic] = convert_mappings_to_links(new_mappings) + old_links: set[LinkURLAgencyPydantic] = convert_mappings_to_links(old_mappings) - links_to_add: list[LinkURLAgencyPydantic] = [] - links_to_remove: list[LinkURLAgencyPydantic] = [] - for subsets in subset_list: - agency_id: int = subsets.agency_id - for url_id in subsets.add: - links_to_add.append( - LinkURLAgencyPydantic(url_id=url_id, agency_id=agency_id) - ) - for url_id in subsets.remove: - links_to_remove.append( - LinkURLAgencyPydantic(url_id=url_id, agency_id=agency_id) - ) + links_to_add: list[LinkURLAgencyPydantic] = list(new_links - old_links) + links_to_remove: list[LinkURLAgencyPydantic] = list(old_links - new_links) + requester = UpdateAgencyURLLinksRequester(session) await requester.add_agency_url_links(links=links_to_add) await 
requester.remove_agency_url_links(links=links_to_remove) + def _get_old_mappings( + self, + lookup_responses: list[MetaURLLookupResponse] + ) -> list[AgencyURLMappings]: + old_mappings: list[AgencyURLMappings] = convert_lookup_responses_to_mappings(lookup_responses) + relevant_agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(self._sync_responses) + # Exclude old mappings that are not relevant + filtered_old_mappings: list[AgencyURLMappings] = filter_non_relevant_mappings( + mappings=old_mappings, + relevant_agency_ids=relevant_agency_ids, + ) + return filtered_old_mappings + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py index c4b23b48..123bd0ba 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py @@ -1,40 +1,12 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.subsets import AgencyMetaURLLinkSubsets -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings - -def _convert_to_agency_id_to_url_ids(mappings: list[AgencyURLMappings]) -> dict[int, list[int]]: - agency_id_to_url_ids: dict[int, list[int]] = {} - for mapping in mappings: - agency_id_to_url_ids[mapping.agency_id] = mapping.url_ids - return agency_id_to_url_ids - - -def filter_agency_meta_url_link_subsets( - new_mappings: list[AgencyURLMappings], - old_mappings: list[AgencyURLMappings], -) -> list[AgencyMetaURLLinkSubsets]: - - agency_id_to_new_url_ids: dict[int, list[int]] = _convert_to_agency_id_to_url_ids(new_mappings) - agency_id_to_old_url_ids: dict[int, list[int]] = _convert_to_agency_id_to_url_ids(old_mappings) - - subset_list: list[AgencyMetaURLLinkSubsets] = [] - - for agency_id in agency_id_to_new_url_ids.keys(): - - new_url_ids: set[int] = set(agency_id_to_new_url_ids[agency_id]) - 
old_url_ids: set[int] = set(agency_id_to_old_url_ids.get(agency_id, [])) - - url_ids_to_add: list[int] = list(new_url_ids - old_url_ids) - url_ids_to_remove: list[int] = list(old_url_ids - new_url_ids) - url_ids_to_do_nothing_with: list[int] = list(old_url_ids & new_url_ids) - - subsets = AgencyMetaURLLinkSubsets( - agency_id=agency_id, - add=url_ids_to_add, - remove=url_ids_to_remove, - do_nothing=url_ids_to_do_nothing_with, - ) - subset_list.append(subsets) - - return subset_list +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings +def filter_non_relevant_mappings( + mappings: list[AgencyURLMappings], + relevant_agency_ids: list[int] +) -> list[AgencyURLMappings]: + relevant_mappings: list[AgencyURLMappings] = [] + for mapping in mappings: + if mapping.agency_id in relevant_agency_ids: + relevant_mappings.append(mapping) + return relevant_mappings \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py new file mode 100644 index 00000000..281be2d9 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ + extract_agency_ids_from_agencies_sync_response +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.db.helpers.session import session_helper as sh + 
+class LookupMetaURLLinksQueryBuilder(QueryBuilderBase): + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(sync_responses) + + async def run(self, session: AsyncSession) -> list[LinkURLAgencyPydantic]: + + query = ( + select( + LinkURLAgency.url_id, + LinkURLAgency.agency_id + ) + .where( + LinkURLAgency.agency_id.in_(self.agency_ids), + ) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + links: list[LinkURLAgencyPydantic] = [ + LinkURLAgencyPydantic(**mapping) for mapping in mappings + ] + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py deleted file mode 100644 index 6fe570d6..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/core.py +++ /dev/null @@ -1,54 +0,0 @@ -from collections import defaultdict -from typing import Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings -from src.db.models.impl.flag.url_validated.enums import URLValidatedType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.queries.base.builder import QueryBuilderBase - -from src.db.helpers.session import session_helper as sh - -class LookupMetaURLAgencyLinksQueryBuilder(QueryBuilderBase): - """Given a set of Agency IDs, return all Meta URL agency links.""" - - def __init__(self, agency_ids: list[int]): - super().__init__() - self._agency_ids = agency_ids - - async def run(self, session: AsyncSession) -> list[AgencyURLMappings]: - query = ( - select( - LinkURLAgency.url_id, 
- LinkURLAgency.agency_id, - ) - .outerjoin( - FlagURLValidated, - FlagURLValidated.url_id == LinkURLAgency.url_id, - ) - .where( - LinkURLAgency.agency_id.in_(self._agency_ids), - FlagURLValidated.type == URLValidatedType.META_URL - ) - ) - db_mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - - agency_id_to_url_ids: dict[int, list[int]] = defaultdict(list) - for mapping in db_mappings: - agency_id: int = mapping["agency_id"] - url_id: int = mapping["url_id"] - agency_id_to_url_ids[agency_id].append(url_id) - - result_mappings: list[AgencyURLMappings] = [] - for agency_id in agency_id_to_url_ids.keys(): - url_ids: list[int] = agency_id_to_url_ids[agency_id] - result_mapping = AgencyURLMappings( - agency_id=agency_id, - url_ids=url_ids, - ) - result_mappings.append(result_mapping) - - return result_mappings \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/response.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py index 787bc5e6..9786c866 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py +++ 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py @@ -1,17 +1,10 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup.core import LookupMetaURLAgencyLinksQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings +from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.templates.requester import RequesterBase -from src.db.helpers.session import session_helper as sh class UpdateAgencyURLLinksRequester(RequesterBase): - async def lookup_meta_url_agency_links(self, agency_ids: list[int]) -> list[AgencyURLMappings]: - return await LookupMetaURLAgencyLinksQueryBuilder( - agency_ids=agency_ids - ).run(session=self.session) - async def add_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: await sh.bulk_insert(self.session, models=links) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/subsets.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/subsets.py deleted file mode 100644 index 0d953b72..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/subsets.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import BaseModel - - -class AgencyMetaURLLinkSubsets(BaseModel): - agency_id: int - add: list[int] - remove: list[int] - do_nothing: list[int] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py new file mode 100644 index 00000000..c8e3d445 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py @@ -0,0 +1,105 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping, func, or_ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ + extract_agency_ids_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class LookupMetaURLsQueryBuilder(QueryBuilderBase): + """Lookup whether URLs exist in DB and are validated as meta URLs""" + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.urls: list[str] = extract_urls_from_agencies_sync_response(sync_responses) + self.agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(sync_responses) + + async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: + agency_ids_label: str = "agency_ids" + url_id_label: str = "url_id" + + cte = ( + select( + URL.id.label(url_id_label), + func.array_agg( + Agency.id, + ).label(agency_ids_label) + ) + .select_from( + URL + ) + .outerjoin( + LinkURLAgency, + LinkURLAgency.url_id == URL.id, + ) + .where( + or_( + URL.url.in_(self.urls), + 
LinkURLAgency.agency_id.in_(self.agency_ids) + ) + ) + .group_by( + URL.id, + ) + .cte("urls_and_agencies") + ) + + query = ( + select( + cte.c[url_id_label], + cte.c[agency_ids_label], + URL.url, + URL.record_type, + FlagURLValidated.type + ) + .select_from( + cte + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == cte.c[url_id_label], + ) + .outerjoin( + URL, + URL.id == cte.c[url_id_label], + ) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + urls_in_db = set() + extant_lookup_responses: list[MetaURLLookupResponse] = [] + for mapping in mappings: + url = mapping["url"] + urls_in_db.add(url) + response = MetaURLLookupResponse( + url=url, + url_id=mapping[url_id_label], + record_type=mapping["record_type"], + validation_type=mapping["type"], + agency_ids=mapping[agency_ids_label], + ) + extant_lookup_responses.append(response) + + urls_not_in_db = set(self.urls) - set(urls_in_db) + non_extant_lookup_responses = [ + MetaURLLookupResponse( + url=url, + url_id=None, + record_type=None, + validation_type=None, + agency_ids=[], + ) for url in urls_not_in_db + ] + + return extant_lookup_responses + non_extant_lookup_responses diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/extract.py new file mode 100644 index 00000000..d054f645 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/extract.py @@ -0,0 +1,10 @@ +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def extract_agency_ids_from_agencies_sync_response( + responses: list[AgenciesSyncResponseInnerInfo] +) -> list[int]: + return [ + response.agency_id + for response in responses + ] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/response.py similarity index 94% rename from 
src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/response.py index ff2d668d..7f77a012 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/response.py @@ -9,6 +9,7 @@ class MetaURLLookupResponse(BaseModel): url_id: int | None record_type: RecordType | None validation_type: URLValidatedType | None + agency_ids: list[int] | None @property def exists_in_db(self) -> bool: diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py index 8d3e8785..4aee9d91 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py @@ -1,4 +1,4 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams from src.db.dtos.url.mapping import URLMapping diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py index 74207ff1..16bc2a05 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py @@ -1,13 +1,10 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.mapper 
import AgencyIDMetaURLMapper +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.requester import UpdateMetaURLsRequester -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.response import AgencyURLMappings -from src.db.dtos.url.mapping import URLMapping from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -from src.util.url_mapper import URLMapper class UpsertMetaUrlsQueryBuilder(QueryBuilderBase): @@ -17,39 +14,16 @@ def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): super().__init__() self.sync_responses = sync_responses - async def run(self, session: AsyncSession) -> list[AgencyURLMappings]: + async def run(self, session: AsyncSession) -> None: requester = UpdateMetaURLsRequester(session) lookup_responses: list[MetaURLLookupResponse] = \ await requester.lookup_meta_urls(self.sync_responses) - new_url_mappings = \ - await requester.add_new_urls_to_database(lookup_responses) - existing_url_mappings = \ - await requester.update_existing_urls(lookup_responses) - - all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings - - return self._build_responses(all_url_mappings) - - - def _build_responses(self, all_url_mappings: list[URLMapping]) -> list[AgencyURLMappings]: - agency_id_mapper = AgencyIDMetaURLMapper(self.sync_responses) - url_mapper = URLMapper(all_url_mappings) - - responses: list[AgencyURLMappings] = [] - for agency_id in agency_id_mapper.get_all_ids(): - url_ids: list[int] = [] - agency_urls: list[str] = agency_id_mapper.get_urls(agency_id) - for agency_url in agency_urls: - url_ids.append(url_mapper.get_id(agency_url)) - response = AgencyURLMappings( - agency_id=agency_id, - 
url_ids=url_ids, - ) - responses.append(response) - - return responses + await requester.add_new_urls_to_database(lookup_responses) + filtered_lookup_responses: list[MetaURLLookupResponse] = \ + filter_urls_in_sync(self.sync_responses, lookup_responses=lookup_responses) + await requester.update_existing_urls(filtered_lookup_responses) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py index a0a80732..0684acf0 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py @@ -1,4 +1,6 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo def filter_urls_to_add( @@ -13,8 +15,23 @@ def filter_urls_to_add( def filter_existing_url_mappings( lookup_responses: list[MetaURLLookupResponse] ) -> list[MetaURLLookupResponse]: + """Filter only URL mappings that already exist in the database.""" return [ lookup_response for lookup_response in lookup_responses if lookup_response.exists_in_db - ] \ No newline at end of file + ] + +def filter_urls_in_sync( + sync_responses: list[AgenciesSyncResponseInnerInfo], + lookup_responses: list[MetaURLLookupResponse] +) -> list[MetaURLLookupResponse]: + """Filter only URLs that are in sync responses.""" + sync_urls: set[str] = set( + extract_urls_from_agencies_sync_response(sync_responses) + ) + filtered_lookup_responses: list[MetaURLLookupResponse] = [] + for lookup_response in lookup_responses: + if lookup_response.url in sync_urls: + 
filtered_lookup_responses.append(lookup_response) + return filtered_lookup_responses \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py deleted file mode 100644 index 82b0012a..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - -from src.db.helpers.session import session_helper as sh - -class LookupMetaURLsQueryBuilder(QueryBuilderBase): - """Lookup whether URLs exist in DB and are validated as meta URLs""" - - def __init__(self, urls: list[str]): - super().__init__() - self.urls = urls - - async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: - query = ( - select( - URL.id, - URL.url, - URL.record_type, - FlagURLValidated.type - ) - .where( - URL.url.in_(self.urls) - ) - .join( - FlagURLValidated, - FlagURLValidated.url_id == URL.id, - isouter=True - ) - ) - mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - - return [ - MetaURLLookupResponse( - url=mapping["url"], - url_id=mapping["id"], - record_type=mapping["record_type"], - validation_type=mapping["type"] - ) for mapping in mappings - ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/mapper.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/mapper.py deleted file mode 100644 index d5962770..00000000 --- 
a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/mapper.py +++ /dev/null @@ -1,20 +0,0 @@ -from collections import defaultdict - -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -class AgencyIDMetaURLMapper: - - def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): - self._meta_url_to_agency_id: dict[str, list[int]] = defaultdict(list) - self._agency_id_to_meta_urls: dict[int, list[str]] = defaultdict(list) - for sync_response in sync_responses: - for meta_url in sync_response.meta_urls: - self._meta_url_to_agency_id[meta_url].append(sync_response.agency_id) - self._agency_id_to_meta_urls[sync_response.agency_id].append(meta_url) - - def get_urls(self, id_: int) -> list[str]: - return self._agency_id_to_meta_urls[id_] - - def get_all_ids(self) -> list[int]: - return list(self._agency_id_to_meta_urls.keys()) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py index 509b0d57..9f66f047 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py @@ -1,11 +1,10 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.core import LookupMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.add.core import AddMetaURLsQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.convert import \ convert_to_update_meta_urls_params, convert_url_lookups_to_url_mappings -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.extract import extract_urls_from_agencies_sync_response from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_existing_url_mappings, \ filter_urls_to_add -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.core import LookupMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.core import UpdateMetaURLsQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams from src.db.dtos.url.mapping import URLMapping @@ -18,11 +17,19 @@ async def lookup_meta_urls( self, agencies: list[AgenciesSyncResponseInnerInfo] ) -> list[MetaURLLookupResponse]: - urls: list[str] = extract_urls_from_agencies_sync_response(agencies) - return await LookupMetaURLsQueryBuilder(urls).run(self.session) + return await LookupMetaURLsQueryBuilder( + agencies + ).run(self.session) - async def add_new_urls_to_database(self, lookup_responses: list[MetaURLLookupResponse]) -> list[URLMapping]: + async def add_new_urls_to_database( + self, + lookup_responses: list[MetaURLLookupResponse] + ) -> list[URLMapping]: + if len(lookup_responses) == 0: + return [] urls_to_add: list[str] = filter_urls_to_add(lookup_responses) + if len(urls_to_add) == 0: + return [] return await AddMetaURLsQueryBuilder(urls_to_add).run(self.session) async def update_existing_urls( @@ -30,7 +37,8 @@ async def update_existing_urls( lookup_responses: list[MetaURLLookupResponse] ) -> list[URLMapping]: existing_url_lookups: list[MetaURLLookupResponse] = ( - filter_existing_url_mappings(lookup_responses)) + filter_existing_url_mappings(lookup_responses) + ) params: list[UpdateMetaURLsParams] = \ convert_to_update_meta_urls_params(existing_url_lookups) await UpdateMetaURLsQueryBuilder(params).run(self.session) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py index cc5ae851..b0c32a7e 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py @@ -29,7 +29,7 @@ def filter_urls_with_non_meta_url_validation_flag( ) -> list[int]: url_ids: list[int] = [] for param in params: - if param.validation_flag is None: + if param.validation_type is None: continue if param.validation_type != URLValidatedType.META_URL: url_ids.append(param.url_id) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 5d7ffe0a..14a03f3b 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -60,7 +60,7 @@ from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \ get_update_agencies_sync_progress_query -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert_.upsert import \ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ convert_agencies_sync_response_to_agencies_upsert from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \ @@ -1255,14 +1255,6 @@ async def get_data_sources_sync_parameters(self) -> DataSourcesSyncParameters: GetDataSourcesSyncParametersQueryBuilder() ) - async def upsert_agencies( - self, - agencies: list[AgenciesSyncResponseInnerInfo] - ) -> None: - await self.bulk_upsert( - models=convert_agencies_sync_response_to_agencies_upsert(agencies) - ) - async def upsert_urls_from_data_sources( self, data_sources: list[DataSourcesSyncResponseInnerInfo] diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 508ed16b..aebf236f 100644 --- 
a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -11,9 +11,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.helpers.session.parser import BulkActionParser -from src.db.models.templates_.with_id import WithIDBase from src.db.models.templates_.base import Base -from src.db.queries.base.builder import QueryBuilderBase +from src.db.models.templates_.with_id import WithIDBase from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.update import BulkUpdatableModel diff --git a/src/db/models/impl/link/url_agency/pydantic.py b/src/db/models/impl/link/url_agency/pydantic.py index 77522a64..fe9194de 100644 --- a/src/db/models/impl/link/url_agency/pydantic.py +++ b/src/db/models/impl/link/url_agency/pydantic.py @@ -1,3 +1,5 @@ +from pydantic import ConfigDict + from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -7,6 +9,8 @@ class LinkURLAgencyPydantic( BulkDeletableModel, BulkInsertableModel ): + model_config = ConfigDict(frozen=True) + url_id: int agency_id: int diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py index 5b0539e7..85b9f1bc 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py @@ -1,20 +1,30 @@ import pytest_asyncio from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import 
update_existing_agencies_updated_at, \ add_existing_agencies + +@pytest_asyncio.fixture +async def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> SyncAgenciesTaskOperator: + return SyncAgenciesTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + @pytest_asyncio.fixture async def setup( db_data_creator, - mock_pdap_client + operator ) -> SyncAgenciesTaskOperator: await add_existing_agencies(db_data_creator) await update_existing_agencies_updated_at(db_data_creator) - return SyncAgenciesTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) + return operator diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py new file mode 100644 index 00000000..cb84b014 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py @@ -0,0 +1,53 @@ +from contextlib import contextmanager +from datetime import timedelta, datetime +from unittest.mock import patch, AsyncMock + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.simple_test_data_functions import generate_test_name + + +def set_up_mock_pdap_client_responses( + mock_pdap_client: PDAPClient, + responses: list[AgenciesSyncResponseInfo | Exception] +) -> None: + """ + Modifies: + - pdap_client.sync_agencies + """ + mock_sync_agencies = AsyncMock( + side_effect=responses + 
[AgenciesSyncResponseInfo(agencies=[])] + ) + mock_pdap_client.sync_agencies = mock_sync_agencies + +async def set_up_urls( + db_data_creator: DBDataCreator, + record_type: RecordType, + validated_type: URLValidatedType | None = None, + agency_ids: list[int] | None = None, +) -> list[int]: + """Create 2 Test URLs in database.""" + url_ids: list[int] = await db_data_creator.create_urls(record_type=record_type, count=2) + if validated_type is not None: + await db_data_creator.create_validated_flags(url_ids=url_ids, validation_type=validated_type) + if agency_ids is not None: + await db_data_creator.create_url_agency_links(url_ids=url_ids, agency_ids=agency_ids) + return url_ids + +def set_up_sync_response_info( + agency_id: int, + meta_urls: list[str], +) -> AgenciesSyncResponseInfo: + yesterday = datetime.now() - timedelta(days=1) + return AgenciesSyncResponseInfo(agencies=[AgenciesSyncResponseInnerInfo( + agency_id=agency_id, + meta_urls=meta_urls, + updated_at=yesterday, + state_name=None, + county_name=None, + locality_name=None, + display_name=generate_test_name(agency_id) + )]) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py new file mode 100644 index 00000000..42384615 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py @@ -0,0 +1,90 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from 
src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_data_sources_url_in_db_not_meta_url_sync( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + In an Agency Sync, a URL validated as a Data Source linked to the agency + should be untouched if the URL is not in the sync response. + """ + db_client: AsyncDatabaseClient = operator.adb_client + + agency_id: int = 1 + + # Create agency + await db_data_creator.create_agency(agency_id) + + # Set up sync response with new meta URL + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=agency_id, + meta_urls=[ + "https://example.com/meta-url-1", + ] + ) + + # Create additional URL Validated as data source and link to agency + ds_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.DATA_SOURCE, + record_type=RecordType.ACCIDENT_REPORTS + ))[0] + ds_url_id: int = ds_url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[ds_url_id], + agency_ids=[agency_id] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 2 URLs in 
database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 2 + assert set(url.record_type for url in urls) == { + RecordType.CONTACT_INFO_AND_AGENCY_META, + RecordType.ACCIDENT_REPORTS + } + + # Confirm 2 Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 2 + assert all(link.agency_id == 1 for link in links) + assert set(link.url_id for link in links) == set(url.id for url in urls) + + # Confirm 2 Validated Flags with different Validation Types + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 2 + assert set(flag.type for flag in flags) == { + URLValidatedType.META_URL, + URLValidatedType.DATA_SOURCE + } + assert set(flag.url_id for flag in flags) == set(url.id for url in urls) + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py index bf4ff81e..80b338db 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py @@ -22,15 +22,12 @@ async def test_agency_sync_interruption( operator = setup db_client = operator.adb_client - - with patch_sync_agencies( [FIRST_CALL_RESPONSE, ValueError("test error")] ): run_info = await operator.run_task() assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message - # Get current updated_ats from database for the 5 recently updated query = ( select( diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py new file mode 100644 index 00000000..9db57ec7 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py @@ -0,0 +1,78 @@ +import pytest + +from 
src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_meta_url_in_db_not_sync( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + In an Agency Sync, a URL in the DB validated as a Meta URL linked to the agency + but not included in the most recent sync response should be removed as a link + """ + db_client: AsyncDatabaseClient = operator.adb_client + + # Create Meta URL and link to Agency + agency_id: int = 1 + await db_data_creator.create_agency(agency_id) + meta_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.META_URL, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META + ))[0] + meta_url_id: int = meta_url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[meta_url_id], + agency_ids=[agency_id] + ) + + # Create Sync Response for agency with no Meta URLs + sync_response: 
AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=agency_id, + meta_urls=[] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 1 URL in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 1 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm no Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 0 + + # Confirm 1 Validated Flag + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 1 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert all(flag.url_id == meta_url_id for flag in flags) + + + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py similarity index 95% rename from tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py rename to tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py index d783b5cb..772139f4 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py @@ -17,6 +17,9 @@ async def test_agency_sync_happy_path( wiped_database, setup: SyncAgenciesTaskOperator ): + """ + Test behavior of Agency sync where no meta URLs are returned. 
+ """ operator = setup db_client = operator.adb_client diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py new file mode 100644 index 00000000..9a0e920b --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py @@ -0,0 +1,77 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_same_meta_url_diff_agency( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + Test that, in the case of a Meta URL already linked with one agency in the DB and + a new sync response with the same Meta URL but linked to a different agency, + the link to the original agency should be untouched while the link to the new agency + should be added. 
+ """ + db_client: AsyncDatabaseClient = operator.adb_client + existing_agency_id: int = 1 + + await db_data_creator.create_agency(existing_agency_id) + meta_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.META_URL, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META + ))[0] + meta_url_id: int = meta_url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[meta_url_id], + agency_ids=[existing_agency_id] + ) + + new_agency_id: int = 2 + meta_url: str = meta_url_mapping.url + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=new_agency_id, + meta_urls=[meta_url] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm two agencies in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 2 + + # Confirm 1 URL in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 1 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm 2 Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 2 + + # Confirm 2 Validated Flag + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 1 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert all(flag.url_id == meta_url_id for flag in flags) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py new file mode 100644 index 00000000..f450df27 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py @@ -0,0 
+1,86 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_same_meta_url_val_record( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + Test that, in the case of a Meta URL already existing in the DB + and linked to an agency but having: + - A URLValidationFlag that is not `Meta URL` + - A Record Type that is not `Contact Info and Agency Meta` + The Meta URL should have: + - The URLValidationFlag set to `Meta URL` + - The Record Type set to `Contact Info and Agency Meta` + - The link to the agency untouched + """ + db_client: AsyncDatabaseClient = operator.adb_client + + # Create agency + agency_id: int = 1 + await db_data_creator.create_agency(agency_id) + + # Create URL and link to Agency + url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.DATA_SOURCE, + 
record_type=RecordType.ACCIDENT_REPORTS, + ))[0] + url_id = url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_id] + ) + + # Create Sync Response + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=agency_id, + meta_urls=[] + ) + + # Run task + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 1 URL in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 1 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm 1 URLValidationFlag in database + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 1 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert all(flag.url_id == url_id for flag in flags) + + # Confirm 1 Agency-URL Link + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 1 + + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py new file mode 100644 index 00000000..13a8eb20 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py @@ -0,0 +1,67 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.sqlalchemy import Agency +from 
src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import patch_sync_agencies, \ + check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success + + +@pytest.mark.asyncio +async def test_with_meta_url_not_in_database( + wiped_database, + operator: SyncAgenciesTaskOperator +): + """ + In an Agency Sync, a Meta URL included in the sync response + but not present in the DB should be added to the DB with: + - The URLValidationFlag set to `Meta URL` + - The Record Type set to `Contact Info and Agency Meta` + - The link to the agency added + """ + db_client: AsyncDatabaseClient = operator.adb_client + + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=1, + meta_urls=[ + "https://example.com/meta-url-1", + "https://example.com/meta-url-2", + ] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 2 URLs in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 2 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm 2 Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + 
assert len(links) == 2 + assert all(link.agency_id == 1 for link in links) + assert set(link.url_id for link in links) == set(url.id for url in urls) + + # Confirm 2 Validated Flags + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 2 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert set(flag.url_id for flag in flags) == set(url.id for url in urls) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py new file mode 100644 index 00000000..51d40d6f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py @@ -0,0 +1,88 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import set_up_urls, \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_meta_url_not_modified( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + agency_ids: list[int], + db_data_creator: 
DBDataCreator, +): + """ + In a Data Source Sync, a validated Meta URL linked to an agency should be untouched + if the sync response includes that same agency with other Data Sources URL + """ + original_url_ids: list[int] = await set_up_urls( + adb_client=adb_client_test, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, + validated_type=URLValidatedType.META_URL, + ) + # Link URLs to existing agencies + await db_data_creator.create_url_agency_links( + url_ids=original_url_ids, + agency_ids=agency_ids, + ) + + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[2, 3], + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Check sync concluded + operator.pdap_client.sync_data_sources.call_count == 2 + + # Confirm presence of 4 URLs in database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 4 + assert all([url.status == URLStatus.OK for url in urls]) + assert set([url.record_type for url in urls]) == { + RecordType.CONTACT_INFO_AND_AGENCY_META, + RecordType.COMPLAINTS_AND_MISCONDUCT + } + all_url_ids: list[int] = [url.id for url in urls] + # Check that all original URLs are present + assert set(all_url_ids) >= set(original_url_ids) + + links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) + assert len(links) == 16 + assert set(link.url_id for link in links) == set(all_url_ids) + assert set(link.agency_id for link in links) == set(agency_ids) + + # Confirm presence of validated flag + flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 4 + assert set([flag.type for flag in flags]) == { + 
URLValidatedType.META_URL, + URLValidatedType.DATA_SOURCE, + } + assert set(flag.url_id for flag in flags) == set(all_url_ids) + diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py new file mode 100644 index 00000000..6fd524a8 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py @@ -0,0 +1,10 @@ +import pytest + + +@pytest.mark.asyncio +async def test_validated_meta_url_not_included(): + """ + If a validated Meta URL is included in the database + This should not be included in the submit approved task + """ + raise NotImplementedError \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 93328162..a27f2c79 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -5,9 +5,12 @@ from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.client.sync import DatabaseClient @@ -39,6 +42,7 @@ from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo from 
tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 +from tests.helpers.simple_test_data_functions import generate_test_name class DBDataCreator: @@ -264,7 +268,7 @@ async def url_miscellaneous_metadata( record_formats: Optional[list[str]] = None, data_portal_type: Optional[str] = "Test Data Portal Type", supplying_entity: Optional[str] = "Test Supplying Entity" - ): + ) -> None: if record_formats is None: record_formats = ["Test Record Format", "Test Record Format 2"] @@ -282,7 +286,11 @@ async def url_miscellaneous_metadata( await self.adb_client.add_miscellaneous_metadata([tdo]) - def duplicate_urls(self, duplicate_batch_id: int, url_ids: list[int]): + def duplicate_urls( + self, + duplicate_batch_id: int, + url_ids: list[int] + ) -> None: """ Create duplicates for all given url ids, and associate them with the given batch @@ -307,7 +315,7 @@ async def error_info( self, url_ids: list[int], task_id: Optional[int] = None - ): + ) -> None: if task_id is None: task_id = await self.task() error_infos = [] @@ -379,32 +387,34 @@ async def create_validated_urls( record_type: RecordType = RecordType.RESOURCES, validation_type: URLValidatedType = URLValidatedType.DATA_SOURCE, count: int = 1 - ) -> list[int]: - url_ids: list[int] = await self.create_urls( + ) -> list[URLMapping]: + url_mappings: list[URLMapping] = await self.create_urls( record_type=record_type, count=count ) + url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] await self.create_validated_flags( url_ids=url_ids, validation_type=validation_type ) - return url_ids + return url_mappings async def create_submitted_urls( self, record_type: RecordType = RecordType.RESOURCES, count: int = 1 - ): - url_ids: list[int] = await self.create_urls( + ) -> list[URLMapping]: + url_mappings: list[URLMapping] = await self.create_urls( record_type=record_type, count=count ) + url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] await 
self.create_validated_flags( url_ids=url_ids, validation_type=URLValidatedType.DATA_SOURCE ) await self.create_url_data_sources(url_ids=url_ids) - return url_ids + return url_mappings async def create_urls( @@ -414,28 +424,29 @@ async def create_urls( record_type: RecordType | None = RecordType.RESOURCES, count: int = 1, batch_id: int | None = None - ): + ) -> list[URLMapping]: - url_ids: list[int] = await create_urls( + url_mappings: list[URLMapping] = await create_urls( adb_client=self.adb_client, status=status, source=source, record_type=record_type, count=count ) + url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] if batch_id is not None: await self.create_batch_url_links( url_ids=url_ids, batch_id=batch_id ) - return url_ids + return url_mappings async def create_batch( self, status: BatchStatus = BatchStatus.READY_TO_LABEL, strategy: CollectorType = CollectorType.EXAMPLE, date_generated: datetime = datetime.now(), - ): + ) -> int: return await create_batch( adb_client=self.adb_client, status=status, @@ -447,8 +458,8 @@ async def create_batch_url_links( self, url_ids: list[int], batch_id: int, - ): - return await create_batch_url_links( + ) -> None: + await create_batch_url_links( adb_client=self.adb_client, url_ids=url_ids, batch_id=batch_id @@ -458,8 +469,8 @@ async def create_validated_flags( self, url_ids: list[int], validation_type: URLValidatedType, - ): - return await create_validated_flags( + ) -> None: + await create_validated_flags( adb_client=self.adb_client, url_ids=url_ids, validation_type=validation_type @@ -468,8 +479,34 @@ async def create_validated_flags( async def create_url_data_sources( self, url_ids: list[int], - ): - return await create_url_data_sources( + ) -> None: + await create_url_data_sources( adb_client=self.adb_client, url_ids=url_ids ) + + async def create_url_agency_links( + self, + url_ids: list[int], + agency_ids: list[int], + ) -> None: + links: list[LinkURLAgency] = [] + for url_id in url_ids: + for 
agency_id in agency_ids: + link = LinkURLAgency( + url_id=url_id, + agency_id=agency_id, + ) + links.append(link) + await self.adb_client.add_all(links) + + async def create_agency(self, agency_id: int = 1) -> None: + agency = Agency( + agency_id=agency_id, + name=generate_test_name(agency_id), + state=None, + county=None, + locality=None + ) + await self.adb_client.add_all([agency]) + diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index f2bf2c97..6054c902 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus, RecordType from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.batch.pydantic.insert import BatchInsertModel from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic @@ -29,14 +30,15 @@ async def create_urls( source: URLSource = URLSource.COLLECTOR, record_type: RecordType | None = RecordType.RESOURCES, count: int = 1 -) -> list[int]: +) -> list[URLMapping]: urls: list[URLInsertModel] = generate_urls( status=status, source=source, record_type=record_type, count=count, ) - return await adb_client.bulk_insert(urls, return_ids=True) + url_ids = await adb_client.bulk_insert(urls, return_ids=True) + return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] async def create_validated_flags( adb_client: AsyncDatabaseClient, diff --git a/uv.lock b/uv.lock index c97b9828..067bc37f 100644 --- a/uv.lock +++ b/uv.lock @@ -151,6 +151,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/ae/9a053dd9229c0fde6b1f1f33f609ccff1ee79ddda364c756a924c6d8563b/APScheduler-3.11.0-py3-none-any.whl", hash = "sha256:fc134ca32e50f5eadcc4938e3a4545ab19131435e851abb40b34d63d5141c6da", 
size = 64004, upload_time = "2024-11-24T19:39:24.442Z" }, ] +[[package]] +name = "asgiref" +version = "3.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/90/61/0aa957eec22ff70b830b22ff91f825e70e1ef732c06666a805730f28b36b/asgiref-3.9.1.tar.gz", hash = "sha256:a5ab6582236218e5ef1648f242fd9f10626cfd4de8dc377db215d5d5098e3142", size = 36870, upload_time = "2025-07-08T09:07:43.344Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/3c/0464dcada90d5da0e71018c04a140ad6349558afb30b3051b4264cc5b965/asgiref-3.9.1-py3-none-any.whl", hash = "sha256:f3bba7092a48005b5f5bacd747d36ee4a5a61f4a269a6df590b43144355ebd2c", size = 23790, upload_time = "2025-07-08T09:07:41.548Z" }, +] + [[package]] name = "asyncpg" version = "0.30.0" @@ -417,6 +426,7 @@ dependencies = [ { name = "pyjwt" }, { name = "python-dotenv" }, { name = "requests" }, + { name = "side-effects" }, { name = "sqlalchemy" }, { name = "starlette" }, { name = "tqdm" }, @@ -465,6 +475,7 @@ requires-dist = [ { name = "pyjwt", specifier = "~=2.10.1" }, { name = "python-dotenv", specifier = "~=1.0.1" }, { name = "requests", specifier = "~=2.32.3" }, + { name = "side-effects", specifier = ">=1.6.dev0" }, { name = "sqlalchemy", specifier = "~=2.0.36" }, { name = "starlette", specifier = "~=0.45.3" }, { name = "tqdm", specifier = ">=4.64.1" }, @@ -551,6 +562,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload_time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "django" +version = "3.2.25" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asgiref" }, + { name = "pytz" }, + { name = "sqlparse" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ec/68/0e744f07b57bfdf99abbb6b3eb14fcba188867021c05f4a104e04f6d56b8/Django-3.2.25.tar.gz", hash = "sha256:7ca38a78654aee72378594d63e51636c04b8e28574f5505dff630895b5472777", size = 9836336, upload_time = "2024-03-04T08:57:02.257Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/30/8e/cc23c762c5dcd1d367d73cf006a326e0df2bd0e785cba18b658b39904c1e/Django-3.2.25-py3-none-any.whl", hash = "sha256:a52ea7fcf280b16f7b739cec38fa6d3f8953a5456986944c3ca97e79882b4e38", size = 7890550, upload_time = "2024-03-04T08:56:47.529Z" }, +] + [[package]] name = "dnspython" version = "2.7.0" @@ -1897,6 +1922,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload_time = "2024-01-23T06:32:58.246Z" }, ] +[[package]] +name = "python-env-utils" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/57/96/c49c675b9a8cfb79b7377bb5e357feafb810dd2831201cde4e499c0a5e52/python-env-utils-0.4.1.tar.gz", hash = "sha256:6357d9ae024e5039158ce337bafeca662453f41cd7789a4517217c1a9093ce57", size = 5711, upload_time = "2017-04-09T18:43:59.347Z" } + [[package]] name = "python-multipart" version = "0.0.20" @@ -2050,6 +2084,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload_time = "2023-10-24T04:13:38.866Z" }, ] +[[package]] +name = "side-effects" +version = "1.6.dev0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "django" }, + { name = "python-env-utils" }, +] 
+sdist = { url = "https://files.pythonhosted.org/packages/44/39/c7feca6a6154195b135a4539802bc3c909b931e296c868d6974ff0c9d819/side-effects-1.6.dev0.tar.gz", hash = "sha256:9d069359fc46dbcb78938ca4a7c1e6266db84de0cdf5fc2d8ce664bfe5cae255", size = 16186, upload_time = "2020-01-01T21:29:09.983Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/24/a6def6872e165cc8d3846e5b9c2615f6f566c424d5eb6d99a15eaad7c558/side_effects-1.6.dev0-py3-none-any.whl", hash = "sha256:343f8f34de51f477238e03b0c33d79a5ef31604991a44c187ebfce0fae628c97", size = 13563, upload_time = "2020-01-01T21:29:13.045Z" }, +] + [[package]] name = "simplejson" version = "3.20.1" @@ -2162,6 +2209,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/7c/5fc8e802e7506fe8b55a03a2e1dab156eae205c91bee46305755e086d2e2/sqlalchemy-2.0.40-py3-none-any.whl", hash = "sha256:32587e2e1e359276957e6fe5dad089758bc042a971a8a09ae8ecf7a8fe23d07a", size = 1903894, upload_time = "2025-03-27T18:40:43.796Z" }, ] +[[package]] +name = "sqlparse" +version = "0.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e5/40/edede8dd6977b0d3da179a342c198ed100dd2aba4be081861ee5911e4da4/sqlparse-0.5.3.tar.gz", hash = "sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272", size = 84999, upload_time = "2024-12-10T12:05:30.728Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 44415, upload_time = "2024-12-10T12:05:27.824Z" }, +] + [[package]] name = "starlette" version = "0.45.3" From 7ae95c9d0bd30da00ee13de50f48573c0d84e8e4 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 30 Aug 2025 08:46:18 -0400 Subject: [PATCH 12/33] Continue draft --- .../impl/sync/agency/queries/upsert/links/lookup.py | 13 +++++++++++++ 1 file changed, 13 
insertions(+) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py index 281be2d9..09377bdd 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py @@ -6,8 +6,12 @@ from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ extract_agency_ids_from_agencies_sync_response +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from src.db.helpers.session import session_helper as sh @@ -25,7 +29,16 @@ async def run(self, session: AsyncSession) -> list[LinkURLAgencyPydantic]: LinkURLAgency.url_id, LinkURLAgency.agency_id ) + .join( + URL, + LinkURLAgency.url_id == URL.id, + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) .where( + FlagURLValidated.type == URLValidatedType.META_URL, LinkURLAgency.agency_id.in_(self.agency_ids), ) ) From 8bbefe5d8c9f0f54e13598d2630e62d3e8ec0d59 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 30 Aug 2025 08:56:34 -0400 Subject: [PATCH 13/33] Continue draft --- .../scheduled/impl/sync/agency/queries/upsert/links/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py index 99d590a1..2c5b4433 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py @@ -27,6 +27,7 @@ def __init__( async def run(self, session: AsyncSession) -> None: # TODO: Replace with LookupMetaURLLinksQueryBuilder + # TODO: Include a Lookup for the URL Mappings of the sync URLs lookup_responses: list[MetaURLLookupResponse] = \ await LookupMetaURLsQueryBuilder(self._sync_responses).run(session=session) From 0c760e236a3ad0ba9471ab65745e7b2e87a6fdf9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 30 Aug 2025 16:17:31 -0400 Subject: [PATCH 14/33] Finish automated tests --- ...65a1431_augment_auto_agency_suggestions.py | 4 + .../sync/agency/queries/upsert/links/build.py | 23 +++++ .../agency/queries/upsert/links/convert.py | 81 ----------------- .../sync/agency/queries/upsert/links/core.py | 59 +++++-------- .../{lookup => links/lookup_}/__init__.py | 0 .../links/{lookup.py => lookup_/links.py} | 12 +-- .../queries/upsert/links/lookup_/url.py | 31 +++++++ .../agency/queries/upsert/links/requester.py | 9 ++ .../queries/upsert/meta_urls/convert.py | 2 +- .../agency/queries/upsert/meta_urls/core.py | 8 +- .../agency/queries/upsert/meta_urls/filter.py | 2 +- .../upsert/meta_urls/lookup/__init__.py | 0 .../upsert/{ => meta_urls}/lookup/core.py | 59 +++---------- .../upsert/{ => meta_urls}/lookup/extract.py | 0 .../upsert/{ => meta_urls}/lookup/response.py | 1 - .../queries/upsert/meta_urls/requester.py | 10 +-- src/core/tasks/scheduled/manager.py | 7 +- src/core/tasks/scheduled/registry/core.py | 24 +++++- src/core/tasks/scheduled/registry/format.py | 7 ++ .../test_same_meta_url_diff_val_record.py | 86 ------------------- 20 files changed, 147 insertions(+), 278 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py delete mode 100644 
src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{lookup => links/lookup_}/__init__.py (100%) rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/{lookup.py => lookup_/links.py} (72%) create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py create mode 100644 src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{ => meta_urls}/lookup/core.py (50%) rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{ => meta_urls}/lookup/extract.py (100%) rename src/core/tasks/scheduled/impl/sync/agency/queries/upsert/{ => meta_urls}/lookup/response.py (94%) create mode 100644 src/core/tasks/scheduled/registry/format.py delete mode 100644 tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py diff --git a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py index 84db9b19..135a04c5 100644 --- a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py +++ b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py @@ -53,6 +53,7 @@ def upgrade() -> None: _create_flag_url_validated_table() _add_urls_to_flag_url_validated_table() _remove_validated_and_submitted_url_statuses() + _reset_agencies_sync_state() def downgrade() -> None: @@ -64,6 +65,9 @@ def downgrade() -> None: op.drop_table(FLAG_URL_VALIDATED_TABLE_NAME) _drop_validated_url_type_enum() +def _reset_agencies_sync_state(): + op.execute("DELETE FROM agencies_sync_state") + def _remove_validated_and_submitted_url_statuses(): switch_enum_type( table_name="urls", diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py new file mode 100644 index 00000000..5511ea65 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py @@ -0,0 +1,23 @@ +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper + +def build_links_from_url_mappings_and_sync_responses( + url_mappings: list[URLMapping], + sync_responses: list[AgenciesSyncResponseInnerInfo], +) -> list[LinkURLAgencyPydantic]: + + links: list[LinkURLAgencyPydantic] = [] + + mapper = URLMapper(url_mappings) + for sync_response in sync_responses: + agency_id: int = sync_response.agency_id + for meta_url in sync_response.meta_urls: + url_id: int = mapper.get_id(meta_url) + link = LinkURLAgencyPydantic( + agency_id=agency_id, + url_id=url_id + ) + links.append(link) + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py deleted file mode 100644 index 7317b23b..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/convert.py +++ /dev/null @@ -1,81 +0,0 @@ -from collections import defaultdict - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -from src.util.url_mapper import URLMapper - - -def _convert_lookup_response_to_url_mapping( - response: MetaURLLookupResponse -) -> URLMapping: - return URLMapping( 
- url_id=response.url_id, - url=response.url, - ) - -def convert_sync_and_lookup_responses_to_sync_mappings( - sync_responses: list[AgenciesSyncResponseInnerInfo], - lookup_responses: list[MetaURLLookupResponse] -) -> list[AgencyURLMappings]: - """Get all prior Agency-URL mappings. - Leveraging the lookup responses to get the URL ids - """ - - # Get the URL ids for the URLs - lookup_url_mappings: list[URLMapping] = [ - _convert_lookup_response_to_url_mapping(response) - for response in lookup_responses - ] - url_mapper = URLMapper(lookup_url_mappings) - - # Associate Agency with URLs in Sync Responses - agency_to_sync_urls: dict[int, list[str]] = {} - for response in sync_responses: - agency_to_sync_urls[response.agency_id] = response.meta_urls - - # Create Agency-URL Mappings - agency_url_mappings: list[AgencyURLMappings] = [] - for agency in agency_to_sync_urls: - url_ids: list[int] = [] - for url in agency_to_sync_urls[agency]: - url_id: int = url_mapper.get_id(url) - url_ids.append(url_id) - agency_url_mapping = AgencyURLMappings( - agency_id=agency, - url_ids=url_ids, - ) - agency_url_mappings.append(agency_url_mapping) - - return agency_url_mappings - - -def convert_lookup_responses_to_mappings( - responses: list[MetaURLLookupResponse] -) -> list[AgencyURLMappings]: - """Get all current Agency-URL mappings.""" - agency_to_url_ids: dict[int, list[int]] = defaultdict(list) - for response in responses: - for agency_id in response.agency_ids: - agency_to_url_ids[agency_id].append(response.url_id) - - agency_url_mappings: list[AgencyURLMappings] = [] - for agency_id in agency_to_url_ids: - agency_url_mappings.append(AgencyURLMappings( - agency_id=agency_id, - url_ids=agency_to_url_ids[agency_id], - )) - - return agency_url_mappings - -def convert_mappings_to_links( - mappings: list[AgencyURLMappings] -) -> set[LinkURLAgencyPydantic]: - links: set[LinkURLAgencyPydantic] = set() - for mapping in mappings: - for url_id in mapping.url_ids: - 
links.add(LinkURLAgencyPydantic(url_id=url_id, agency_id=mapping.agency_id)) - - return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py index 2c5b4433..37d63a03 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py @@ -1,15 +1,12 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.convert import \ - convert_lookup_responses_to_mappings, convert_mappings_to_links, convert_sync_and_lookup_responses_to_sync_mappings -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.filter import filter_non_relevant_mappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.build import \ + build_links_from_url_mappings_and_sync_responses from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.requester import UpdateAgencyURLLinksRequester -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.core import LookupMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.extract import \ extract_agency_ids_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync +from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.link.url_agency.pydantic import 
LinkURLAgencyPydantic from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo @@ -26,42 +23,28 @@ def __init__( self._sync_responses = sync_responses async def run(self, session: AsyncSession) -> None: - # TODO: Replace with LookupMetaURLLinksQueryBuilder - # TODO: Include a Lookup for the URL Mappings of the sync URLs + # Get all existing links + requester = UpdateAgencyURLLinksRequester(session) - lookup_responses: list[MetaURLLookupResponse] = \ - await LookupMetaURLsQueryBuilder(self._sync_responses).run(session=session) - filtered_lookup_responses: list[MetaURLLookupResponse] = \ - filter_urls_in_sync(self._sync_responses, lookup_responses=lookup_responses) + # Build new links from sync responses and URL mappings + sync_urls: list[str] = extract_urls_from_agencies_sync_response(self._sync_responses) + url_mappings: list[URLMapping] = await requester.get_url_mappings(urls=sync_urls) + new_links: list[LinkURLAgencyPydantic] = build_links_from_url_mappings_and_sync_responses( + url_mappings=url_mappings, + sync_responses=self._sync_responses, + ) - new_mappings: list[AgencyURLMappings] = convert_sync_and_lookup_responses_to_sync_mappings( - self._sync_responses, - lookup_responses=filtered_lookup_responses, + sync_agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(self._sync_responses) + old_links: list[LinkURLAgencyPydantic] = await requester.get_current_agency_url_links( + agency_ids=sync_agency_ids, ) - old_mappings: list[AgencyURLMappings] = self._get_old_mappings(filtered_lookup_responses) - new_links: set[LinkURLAgencyPydantic] = convert_mappings_to_links(new_mappings) - old_links: set[LinkURLAgencyPydantic] = convert_mappings_to_links(old_mappings) + new_set: set[LinkURLAgencyPydantic] = set(new_links) + old_set: set[LinkURLAgencyPydantic] = set(old_links) - links_to_add: list[LinkURLAgencyPydantic] = list(new_links - old_links) - links_to_remove: 
list[LinkURLAgencyPydantic] = list(old_links - new_links) + links_to_add: list[LinkURLAgencyPydantic] = list(new_set - old_set) + links_to_remove: list[LinkURLAgencyPydantic] = list(old_set - new_set) - requester = UpdateAgencyURLLinksRequester(session) await requester.add_agency_url_links(links=links_to_add) await requester.remove_agency_url_links(links=links_to_remove) - def _get_old_mappings( - self, - lookup_responses: list[MetaURLLookupResponse] - ) -> list[AgencyURLMappings]: - old_mappings: list[AgencyURLMappings] = convert_lookup_responses_to_mappings(lookup_responses) - relevant_agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(self._sync_responses) - # Exclude old mappings that are not relevant - filtered_old_mappings: list[AgencyURLMappings] = filter_non_relevant_mappings( - mappings=old_mappings, - relevant_agency_ids=relevant_agency_ids, - ) - return filtered_old_mappings - - - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/__init__.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py similarity index 72% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py index 09377bdd..9336deaa 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py @@ -3,24 +3,20 @@ from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ - extract_agency_ids_from_agencies_sync_response -from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.helpers.session import session_helper as sh from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -from src.db.helpers.session import session_helper as sh + class LookupMetaURLLinksQueryBuilder(QueryBuilderBase): - def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + def __init__(self, agency_ids: list[int]): super().__init__() - self.agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(sync_responses) + self.agency_ids: list[int] = agency_ids async def run(self, session: AsyncSession) -> list[LinkURLAgencyPydantic]: diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py new file mode 100644 index 00000000..8b526447 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py @@ -0,0 +1,31 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + 
+class LookupURLQueryBuilder(QueryBuilderBase): + + def __init__(self, urls: list[str]): + super().__init__() + self.urls: list[str] = urls + + async def run(self, session: AsyncSession) -> list[URLMapping]: + query = ( + select( + URL.id.label("url_id"), + URL.url, + ) + .where( + URL.url.in_(self.urls), + ) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + urls: list[URLMapping] = [ + URLMapping(**mapping) for mapping in mappings + ] + return urls \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py index 9786c866..96887dfa 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py @@ -1,3 +1,6 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.links import LookupMetaURLLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.url import LookupURLQueryBuilder +from src.db.dtos.url.mapping import URLMapping from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.templates.requester import RequesterBase @@ -5,6 +8,12 @@ class UpdateAgencyURLLinksRequester(RequesterBase): + async def get_url_mappings(self, urls: list[str]) -> list[URLMapping]: + return await LookupURLQueryBuilder(urls=urls).run(session=self.session) + + async def get_current_agency_url_links(self, agency_ids: list[int]) -> list[LinkURLAgencyPydantic]: + return await LookupMetaURLLinksQueryBuilder(agency_ids=agency_ids).run(session=self.session) + async def add_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: await sh.bulk_insert(self.session, models=links) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py index 4aee9d91..8d3e8785 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py @@ -1,4 +1,4 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams from src.db.dtos.url.mapping import URLMapping diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py index 16bc2a05..6f5c3593 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py @@ -1,7 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.requester import UpdateMetaURLsRequester from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo @@ -17,10 +18,13 @@ def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): async def run(self, session: AsyncSession) -> None: requester = UpdateMetaURLsRequester(session) + 
sync_urls: list[str] = extract_urls_from_agencies_sync_response(self.sync_responses) + lookup_responses: list[MetaURLLookupResponse] = \ - await requester.lookup_meta_urls(self.sync_responses) + await requester.lookup_meta_urls(sync_urls) await requester.add_new_urls_to_database(lookup_responses) + filtered_lookup_responses: list[MetaURLLookupResponse] = \ filter_urls_in_sync(self.sync_responses, lookup_responses=lookup_responses) await requester.update_existing_urls(filtered_lookup_responses) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py index 0684acf0..227f0edc 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py @@ -1,5 +1,5 @@ from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py similarity index 50% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py index c8e3d445..8a817bd4 100644 --- 
a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/core.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py @@ -1,78 +1,41 @@ from typing import Sequence -from sqlalchemy import select, RowMapping, func, or_ +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.extract import \ - extract_agency_ids_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse -from src.db.models.impl.agency.sqlalchemy import Agency +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.db.helpers.session import session_helper as sh from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - class LookupMetaURLsQueryBuilder(QueryBuilderBase): """Lookup whether URLs exist in DB and are validated as meta URLs""" - def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + def __init__(self, urls: list[str]): super().__init__() - self.urls: list[str] = extract_urls_from_agencies_sync_response(sync_responses) - self.agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(sync_responses) + self.urls: list[str] = urls async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: - agency_ids_label: str = "agency_ids" url_id_label: str = "url_id" - cte = ( - select( - 
URL.id.label(url_id_label), - func.array_agg( - Agency.id, - ).label(agency_ids_label) - ) - .select_from( - URL - ) - .outerjoin( - LinkURLAgency, - LinkURLAgency.url_id == URL.id, - ) - .where( - or_( - URL.url.in_(self.urls), - LinkURLAgency.agency_id.in_(self.agency_ids) - ) - ) - .group_by( - URL.id, - ) - .cte("urls_and_agencies") - ) - query = ( select( - cte.c[url_id_label], - cte.c[agency_ids_label], + URL.id.label(url_id_label), URL.url, URL.record_type, FlagURLValidated.type ) .select_from( - cte + URL ) .outerjoin( FlagURLValidated, - FlagURLValidated.url_id == cte.c[url_id_label], + FlagURLValidated.url_id == URL.id, ) - .outerjoin( - URL, - URL.id == cte.c[url_id_label], + .where( + URL.url.in_(self.urls) ) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) @@ -87,7 +50,6 @@ async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: url_id=mapping[url_id_label], record_type=mapping["record_type"], validation_type=mapping["type"], - agency_ids=mapping[agency_ids_label], ) extant_lookup_responses.append(response) @@ -98,7 +60,6 @@ async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: url_id=None, record_type=None, validation_type=None, - agency_ids=[], ) for url in urls_not_in_db ] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/extract.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py similarity index 94% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/response.py rename to 
src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py index 7f77a012..ff2d668d 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/lookup/response.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py @@ -9,7 +9,6 @@ class MetaURLLookupResponse(BaseModel): url_id: int | None record_type: RecordType | None validation_type: URLValidatedType | None - agency_ids: list[int] | None @property def exists_in_db(self) -> bool: diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py index 9f66f047..0a3e3c76 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py @@ -1,24 +1,24 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.core import LookupMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.add.core import AddMetaURLsQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.convert import \ convert_to_update_meta_urls_params, convert_url_lookups_to_url_mappings from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_existing_url_mappings, \ filter_urls_to_add +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.core import LookupMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.core import UpdateMetaURLsQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import 
UpdateMetaURLsParams from src.db.dtos.url.mapping import URLMapping from src.db.templates.requester import RequesterBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + class UpdateMetaURLsRequester(RequesterBase): async def lookup_meta_urls( self, - agencies: list[AgenciesSyncResponseInnerInfo] + urls: list[str] ) -> list[MetaURLLookupResponse]: return await LookupMetaURLsQueryBuilder( - agencies + urls ).run(self.session) async def add_new_urls_to_database( diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index e97e0f8e..86dfff70 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -25,13 +25,13 @@ def __init__( self._loader = loader self._registry = registry - # Main objects - self.scheduler = AsyncIOScheduler() - async def setup(self): self._registry.start_scheduler() await self.add_scheduled_tasks() + await self._registry.report_next_scheduled_task() + + async def add_scheduled_tasks(self): """ @@ -68,3 +68,4 @@ async def run_task(self, operator: ScheduledTaskOperatorBase): operator: ScheduledTaskOperatorBase raise Exception(f"Task {operator.task_type.value} has not been linked to any URLs but is designated as a link task") await self._handler.handle_outcome(run_info) + await self._registry.report_next_scheduled_task() diff --git a/src/core/tasks/scheduled/registry/core.py b/src/core/tasks/scheduled/registry/core.py index a1928504..a622346c 100644 --- a/src/core/tasks/scheduled/registry/core.py +++ b/src/core/tasks/scheduled/registry/core.py @@ -6,6 +6,7 @@ from apscheduler.triggers.interval import IntervalTrigger from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry +from src.core.tasks.scheduled.registry.format import format_job_datetime from src.db.enums import TaskType @@ -29,8 +30,9 @@ async def add_job( Modifies: self._jobs """ - self._jobs[entry.operator.task_type] = self.scheduler.add_job( - func, + job: Job = 
self.scheduler.add_job( + id=entry.operator.task_type.value, + func=func, trigger=IntervalTrigger( minutes=entry.interval.value, start_date=datetime.now() + timedelta(minutes=minute_lag) @@ -38,6 +40,10 @@ async def add_job( misfire_grace_time=60, kwargs={"operator": entry.operator} ) + run_time_str: str = format_job_datetime(job.next_run_time) + print(f"Adding {job.id} task to scheduler. " + + f"First run at {run_time_str}") + self._jobs[entry.operator.task_type] = job def start_scheduler(self) -> None: """ @@ -48,4 +54,16 @@ def start_scheduler(self) -> None: def shutdown_scheduler(self) -> None: if self.scheduler.running: - self.scheduler.shutdown() \ No newline at end of file + self.scheduler.shutdown() + + async def report_next_scheduled_task(self): + jobs: list[Job] = self.scheduler.get_jobs() + if len(jobs) == 0: + print("No scheduled tasks found.") + return + + jobs_sorted: list[Job] = sorted(jobs, key=lambda job: job.next_run_time) + next_job: Job = jobs_sorted[0] + + run_time_str: str = format_job_datetime(next_job.next_run_time) + print(f"Next scheduled task: {run_time_str} ({next_job.id})") \ No newline at end of file diff --git a/src/core/tasks/scheduled/registry/format.py b/src/core/tasks/scheduled/registry/format.py new file mode 100644 index 00000000..23eea364 --- /dev/null +++ b/src/core/tasks/scheduled/registry/format.py @@ -0,0 +1,7 @@ +from datetime import datetime + +def format_job_datetime(dt: datetime) -> str: + date_str: str = dt.strftime("%Y-%m-%d") + format_24: str = dt.strftime("%H:%M:%S") + format_12: str = dt.strftime("%I:%M:%S %p") + return f"{date_str} {format_24} ({format_12})" \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py deleted file mode 100644 index f450df27..00000000 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_val_record.py +++ /dev/null @@ -1,86 +0,0 @@ -import pytest - -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import URLValidatedType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ - set_up_mock_pdap_client_responses -from tests.helpers.asserts import assert_task_run_success -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_same_meta_url_val_record( - wiped_database, - operator: SyncAgenciesTaskOperator, - db_data_creator: DBDataCreator -): - """ - Test that, in the case of a Meta URL already existing in the DB - and linked to an agency but having: - - A URLValidationFlag that is not `Meta URL` - - A Record Type that is not `Contact Info and Agency Meta` - The Meta URL should have: - - The URLValidationFlag set to `Meta URL` - - The Record Type set to `Contact Info and Agency Meta` - - The link to the agency untouched - """ - db_client: AsyncDatabaseClient = operator.adb_client - - # Create agency - agency_id: int = 1 - await db_data_creator.create_agency(agency_id) - - # Create URL and link to Agency - url_mapping: URLMapping = (await 
db_data_creator.create_validated_urls( - validation_type=URLValidatedType.DATA_SOURCE, - record_type=RecordType.ACCIDENT_REPORTS, - ))[0] - url_id = url_mapping.url_id - await db_data_creator.create_url_agency_links( - url_ids=[url_id], - agency_ids=[agency_id] - ) - - # Create Sync Response - sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( - agency_id=agency_id, - meta_urls=[] - ) - - # Run task - set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - await check_sync_concluded(db_client) - - # Confirm one agency in the database - agencies: list[Agency] = await db_client.get_all(Agency) - assert len(agencies) == 1 - - # Confirm 1 URL in database - urls: list[URL] = await db_client.get_all(URL) - assert len(urls) == 1 - assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) - - # Confirm 1 URLValidationFlag in database - flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) - assert len(flags) == 1 - assert all(flag.type == URLValidatedType.META_URL for flag in flags) - assert all(flag.url_id == url_id for flag in flags) - - # Confirm 1 Agency-URL Link - links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) - assert len(links) == 1 - - From 01f7a5025028448d3672a698f37ee251619ba732 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 1 Sep 2025 10:31:29 -0400 Subject: [PATCH 15/33] Update draft --- ...d5aa7670ff_remove_functional_duplicates.py | 2 +- ...65a1431_augment_auto_agency_suggestions.py | 14 +- ...aee0dd79_overhaul_agency_identification.py | 250 ++++++++++++++++++ ...daf0_revise_agency_identification_logic.py | 2 +- src/core/tasks/scheduled/loader.py | 16 +- src/core/tasks/url/manager.py | 3 +- .../operators/agency_identification/core.py | 2 +- .../subtasks/impl/base.py | 16 -- .../subtasks/impl/ckan.py | 2 +- .../subtasks/impl/homepage_match.py | 15 ++ 
.../subtasks/impl/muckrock.py | 2 +- .../subtasks/impl/nlp_location_match.py | 0 .../subtasks/impl/unknown.py | 2 +- .../agency_identification/subtasks/loader.py | 2 +- .../subtasks/models/__init__.py | 0 .../subtasks/models/run_info.py | 9 + .../subtasks/templates/__init__.py | 0 .../subtasks/templates/output.py | 5 + .../subtasks/templates/postprocessor.py | 26 ++ .../subtasks/templates/subtask.py | 29 ++ .../url/suggestion/agency/link/__init__.py | 0 .../url/suggestion/agency/link/pydantic.py | 11 + .../url/suggestion/agency/link/sqlalchemy.py | 24 ++ .../url/suggestion/agency/subtask/__init__.py | 0 .../url/suggestion/agency/subtask/enum.py | 19 ++ .../url/suggestion/agency/subtask/pydantic.py | 15 ++ .../suggestion/agency/subtask/sqlalchemy.py | 27 ++ src/external/pdap/client.py | 13 +- src/util/alembic_helpers.py | 12 + 29 files changed, 476 insertions(+), 42 deletions(-) create mode 100644 alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py create mode 100644 src/db/models/impl/url/suggestion/agency/link/__init__.py create mode 100644 
src/db/models/impl/url/suggestion/agency/link/pydantic.py create mode 100644 src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py create mode 100644 src/db/models/impl/url/suggestion/agency/subtask/__init__.py create mode 100644 src/db/models/impl/url/suggestion/agency/subtask/enum.py create mode 100644 src/db/models/impl/url/suggestion/agency/subtask/pydantic.py create mode 100644 src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py diff --git a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py index 846329ca..201d2448 100644 --- a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py +++ b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py @@ -52,7 +52,7 @@ def downgrade() -> None: _remove_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME) _remove_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME) _remove_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME) - _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME) + # _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME) def _delete_duplicate_urls() -> None: op.execute('delete from urls where id in (2341,2343,2344,2347,2348,2349,2354,2359,2361,2501,2504,2505,2506,2507)') diff --git a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py index 135a04c5..de3069e2 100644 --- a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py +++ 
b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py @@ -29,7 +29,6 @@ "nlp_location_match", "muckrock_match", "ckan_match", - "unknown", name="agency_auto_suggestion_method", ) @@ -66,7 +65,15 @@ def downgrade() -> None: _drop_validated_url_type_enum() def _reset_agencies_sync_state(): - op.execute("DELETE FROM agencies_sync_state") + op.execute( + """ + UPDATE agencies_sync_state + set + last_full_sync_at = null, + current_cutoff_date = null, + current_page = null + """ + ) def _remove_validated_and_submitted_url_statuses(): switch_enum_type( @@ -201,8 +208,7 @@ def _alter_auto_agency_suggestions_table(): sa.Column( 'method', AGENCY_AUTO_SUGGESTION_METHOD_ENUM, - server_default="unknown", - nullable=False + nullable=True ) ) # Confidence diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py new file mode 100644 index 00000000..89f3e750 --- /dev/null +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -0,0 +1,250 @@ +"""Overhaul agency identification + +Revision ID: 70baaee0dd79 +Revises: b741b65a1431 +Create Date: 2025-08-31 19:30:20.690369 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import id_column, url_id_column, created_at_column, agency_id_column, updated_at_column + +# revision identifiers, used by Alembic. 
+revision: str = '70baaee0dd79' +down_revision: Union[str, None] = 'b741b65a1431' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME: str = "url_has_agency_suggestions_view" +URL_UNKNOWN_AGENCIES_VIEW_NAME: str = "url_unknown_agencies_view" + +URL_AUTO_AGENCY_SUBTASK_TABLE_NAME: str = "url_auto_agency_subtask" +LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "link_agency_id_subtask_agencies" + +URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME: str = "url_auto_agency_suggestions" + +AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.dialects.postgresql.ENUM( + name="agency_auto_suggestion_method", + create_type=False +) + +SUBTASK_DETAIL_CODE_ENUM = sa.Enum( + 'no details', + 'blacklist-ckan-no ckan collector', + 'blacklist-muckrock-no muckrock collector', + 'blacklist-nlp-no html', + 'blacklist-homepage-root url', + 'blacklist-homepage-no meta urls associated with root', + 'case-homepage-single agency', + 'case-homepage-no data sources', + 'case-homepage-multi agency nonzero data sources', + name="agency_id_subtask_detail_code", +) + + +def upgrade() -> None: + _create_url_auto_agency_subtask_table() + _create_url_unknown_agencies_view() + _create_link_agency_id_subtask_agencies_table() + _create_url_has_agency_suggestions_view() + _create_new_url_annotation_flags_view() + _drop_url_auto_agency_suggestions_table() + + +def downgrade() -> None: + _drop_url_unknown_agencies_view() + _create_url_auto_agency_suggestions_table() + _create_old_url_annotation_flags_view() + _drop_url_has_agency_suggestions_view() + _drop_link_agency_id_subtask_agencies_table() + _drop_url_auto_agency_subtask_table() + SUBTASK_DETAIL_CODE_ENUM.drop(op.get_bind()) + + +def _drop_url_auto_agency_suggestions_table(): + op.drop_table(URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME) + + +def _create_new_url_annotation_flags_view(): + op.execute( + f""" + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id, + CASE 
WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + auas.has_agency_suggestions AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN lua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed + FROM urls u + LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id + LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id + LEFT JOIN public.{URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME} auas ON u.id = auas.url_id + LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.link_urls_agency lua on u.id = lua.url_id + ) + """ + ) + + +def _create_url_has_agency_suggestions_view(): + op.execute( + f""" + CREATE OR REPLACE VIEW {URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME} AS + SELECT + u.id as url_id, + (uas.id IS NOT NULL) AS has_agency_suggestions + FROM public.urls u + LEFT JOIN public.{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} uas on u.id = uas.url_id + """ + ) + pass + + +def _create_url_unknown_agencies_view(): + op.execute( + f""" + CREATE OR REPLACE VIEW {URL_UNKNOWN_AGENCIES_VIEW_NAME} AS + SELECT + u.id + FROM urls u + LEFT JOIN {URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} uas ON u.id = uas.url_id + GROUP BY u.id + HAVING bool_or(uas.agencies_found) = false + """ + ) + + +def _create_url_auto_agency_subtask_table(): + op.create_table( + 
URL_AUTO_AGENCY_SUBTASK_TABLE_NAME, + id_column(), + url_id_column(), + sa.Column( + "subtask", + AGENCY_AUTO_SUGGESTION_METHOD_ENUM, + nullable=False + ), + sa.Column( + "agencies_found", + sa.Boolean(), + nullable=False + ), + sa.Column( + "detail", + SUBTASK_DETAIL_CODE_ENUM, + nullable=True + ), + created_at_column() + ) + + +def _create_link_agency_id_subtask_agencies_table(): + op.create_table( + LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME, + sa.Column( + "subtask_id", + sa.Integer(), + sa.ForeignKey( + f'{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME}.id', + ondelete='CASCADE' + ), + nullable=False, + comment='A foreign key to the `url_auto_agency_subtask` table.' + ), + sa.Column( + "confidence", + sa.Integer, + sa.CheckConstraint( + "confidence BETWEEN 0 and 100" + ), + nullable=False, + ), + agency_id_column(), + created_at_column() + ) + + +def _drop_link_agency_id_subtask_agencies_table(): + op.drop_table(LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME) + + +def _drop_url_auto_agency_subtask_table(): + op.drop_table(URL_AUTO_AGENCY_SUBTASK_TABLE_NAME) + + +def _create_url_auto_agency_suggestions_table(): + op.create_table( + URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME, + id_column(), + agency_id_column(), + url_id_column(), + sa.Column( + "is_unknown", + sa.Boolean(), + nullable=False + ), + created_at_column(), + updated_at_column(), + sa.Column( + 'method', + AGENCY_AUTO_SUGGESTION_METHOD_ENUM, + nullable=True + ), + sa.Column( + 'confidence', + sa.Float(), + server_default=sa.text('0.0'), + nullable=False + ), + sa.UniqueConstraint("agency_id", "url_id") + ) + + +def _drop_url_unknown_agencies_view(): + op.execute(f"DROP VIEW IF EXISTS {URL_UNKNOWN_AGENCIES_VIEW_NAME}") + + +def _drop_url_has_agency_suggestions_view(): + op.execute(f"DROP VIEW IF EXISTS {URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME}") + + +def _drop_url_annotation_flags_view(): + op.execute("DROP VIEW url_annotation_flags;") + + +def _create_old_url_annotation_flags_view(): + op.execute( + f""" + CREATE OR 
REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id, + CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN cua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed + FROM urls u + LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id + LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id + LEFT JOIN public.{URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME} auas ON u.id = auas.url_id + LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + ) + """ + ) diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py index cd68a4b5..6ba6f7c9 100644 --- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py +++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py @@ -118,7 +118,7 @@ def upgrade(): def downgrade(): # Drop constraints first op.drop_constraint("uq_confirmed_url_agency", "confirmed_url_agency", type_="unique") - op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") + # 
op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") op.drop_constraint("uq_user_url_agency_suggestions", "user_url_agency_suggestions", type_="unique") # Drop tables diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 83c3b100..76c707ea 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -77,6 +77,14 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval=IntervalEnum.DAILY, enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True) ), + ScheduledTaskEntry( + operator=SyncAgenciesTaskOperator( + adb_client=self.async_core.adb_client, + pdap_client=self.pdap_client + ), + interval=IntervalEnum.DAILY, + enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) + ), ScheduledTaskEntry( operator=RunURLTasksTaskOperator(async_core=self.async_core), interval=IntervalEnum.HOURLY, @@ -88,14 +96,6 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval=IntervalEnum.DAILY, enabled=self.env.bool("POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG", default=True) ), - ScheduledTaskEntry( - operator=SyncAgenciesTaskOperator( - adb_client=self.async_core.adb_client, - pdap_client=self.pdap_client - ), - interval=IntervalEnum.DAILY, - enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) - ), ScheduledTaskEntry( operator=PushToHuggingFaceTaskOperator( adb_client=self.async_core.adb_client, diff --git a/src/core/tasks/url/manager.py b/src/core/tasks/url/manager.py index 399da5b0..7fc6b4e3 100644 --- a/src/core/tasks/url/manager.py +++ b/src/core/tasks/url/manager.py @@ -56,8 +56,7 @@ async def _run_task(self, entry: URLTaskEntry) -> None: print(message) await self.handler.post_to_discord(message=message) break - task_id = await self.handler.initiate_task_in_db(task_type=operator.task_type) - run_info: TaskOperatorRunInfo = await operator.run_task(task_id) + run_info: TaskOperatorRunInfo = await 
operator.run_task() await self.conclude_task(run_info) if run_info.outcome == TaskOperatorOutcome.ERROR: break diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index 8ac1f632..7d15c06f 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -3,7 +3,7 @@ from src.core.tasks.url.operators.agency_identification.dtos.output import GetAgencySuggestionsOutput from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py deleted file mode 100644 index 96f98f30..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py +++ /dev/null @@ -1,16 +0,0 @@ -import abc -from abc import ABC -from typing import Optional - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo - - -class AgencyIdentificationSubtaskBase(ABC): - - @abc.abstractmethod - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - raise NotImplementedError diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py index 15dddf6f..19d70db5 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py @@ -4,7 +4,7 @@ from src.core.helpers import process_match_agency_response_to_suggestions from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase from src.external.pdap.client import PDAPClient from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py new file mode 100644 index 00000000..604f21bf --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py @@ -0,0 +1,15 @@ +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase +from src.db.client.async_ import AsyncDatabaseClient + + +class HomepageMatchSubtask(AgencyIdentificationSubtaskBase): + + def __init__(self, db_client: AsyncDatabaseClient): + self.db_client = db_client + + async def run( + self, + url_id: int, + collector_metadata: dict | None = None + ) -> URLAgencySuggestionInfo: \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py index 633d84ac..307e61ee 100644 --- 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py @@ -8,7 +8,7 @@ from src.core.exceptions import MuckrockAPIError from src.core.helpers import process_match_agency_response_to_suggestions from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase from src.external.pdap.client import PDAPClient from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py index 7ffd57bc..5f63cd03 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py @@ -2,7 +2,7 @@ from src.core.enums import SuggestionType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase @final class UnknownAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 6ef84149..a1dad90b 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,6 +1,6 @@ from src.collectors.enums import CollectorType from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ MuckrockAgencyIdentificationSubtask diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py new file mode 100644 index 00000000..59db69e6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + + +class AgencyIDSubtaskRunInfo(BaseModel): + error: str | None = None + + @property + def is_success(self) -> bool: + return self.error is None \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py new file mode 100644 index 00000000..02ae76a4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class AgencyIDSubtaskOutputBase(BaseModel): + pass \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py new file mode 100644 index 00000000..b366747f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py @@ -0,0 +1,26 @@ +from abc import ABC, abstractmethod + +from src.core.tasks.url.operators.agency_identification.subtasks.templates.output import AgencyIDSubtaskOutputBase +from src.db.client.async_ import AsyncDatabaseClient + + +class SubtaskPostprocessorBase(ABC): + """ + An optional class which takes + the output of the subtask along with the subtask id + and adds additional information to the database. 
+ """ + + def __init__( + self, + subtask_id: int, + subtask_output: AgencyIDSubtaskOutputBase, + adb_client: AsyncDatabaseClient + ): + self.subtask_id = subtask_id + self.subtask_output = subtask_output + self.adb_client = adb_client + + @abstractmethod + async def run(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py new file mode 100644 index 00000000..0aa7ce10 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py @@ -0,0 +1,29 @@ +import abc +from abc import ABC +from typing import Optional + +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo +from src.db.client.async_ import AsyncDatabaseClient + + +class AgencyIdentificationSubtaskBase(ABC): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + ) -> None: + self.adb_client = adb_client + + @abc.abstractmethod + async def meets_prerequisites(self) -> bool: + raise NotImplementedError + + @abc.abstractmethod + async def run(self) -> AgencyIDSubtaskRunInfo: + raise NotImplementedError + + @abc.abstractmethod + async def blacklist(self) -> None: + """Blacklist all invalid URLs + so they will not be picked up by this job in the future.""" diff --git a/src/db/models/impl/url/suggestion/agency/link/__init__.py b/src/db/models/impl/url/suggestion/agency/link/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/link/pydantic.py b/src/db/models/impl/url/suggestion/agency/link/pydantic.py new file mode 100644 index 00000000..8685195f --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/link/pydantic.py @@ -0,0 +1,11 @@ +from 
src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkAgencyIDSubtaskAgenciesPydantic( + BulkInsertableModel, + BulkDeletableModel, +): + subtask_id: int + agency_id: int + confidence: int diff --git a/src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py new file mode 100644 index 00000000..2b36e53a --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py @@ -0,0 +1,24 @@ +from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin +from src.db.models.templates_.base import Base + +import sqlalchemy as sa + +class LinkAgencyIDSubtaskAgencies( + Base, + CreatedAtMixin, + AgencyDependentMixin, +): + __tablename__ = "link_agency_id_subtask_agencies" + + subtask_id = sa.Column( + sa.Integer, + sa.ForeignKey("url_auto_agency_id_subtasks.id"), + nullable=False + ) + confidence = sa.Column( + sa.Integer, + sa.CheckConstraint( + "confidence BETWEEN 0 and 100" + ), + nullable=False, + ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/__init__.py b/src/db/models/impl/url/suggestion/agency/subtask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/subtask/enum.py b/src/db/models/impl/url/suggestion/agency/subtask/enum.py new file mode 100644 index 00000000..5e2a4cb8 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/enum.py @@ -0,0 +1,19 @@ +from enum import Enum + + +class AutoAgencyIDSubtask(Enum): + HOMEPAGE_MATCH = "homepage_match" + NLP_LOCATION_MATCH = "nlp_location_match" + MUCKROCK = "muckrock_match" + CKAN = "ckan_match" + +class SubtaskDetailCode(Enum): + NO_DETAILS = "no details" + BLACKLIST_CKAN_NO_CKAN_COLLECTOR = "blacklist-ckan-no ckan collector" + BLACKLIST_MUCKROCK_NO_MUCKROCK_COLLECTOR = "blacklist-muckrock-no muckrock collector" + 
BLACKLIST_NLP_NO_HTML = "blacklist-nlp-no html" + BLACKLIST_HOMEPAGE_ROOT_URL = "blacklist-homepage-root url" + BLACKLIST_HOMEPAGE_NO_META_URLS_ASSOCIATED_WITH_ROOT = "blacklist-homepage-no meta urls associated with root" + CASE_HOMEPAGE_SINGLE_AGENCY = "case-homepage-single agency" + CASE_HOMEPAGE_NO_DATA_SOURCES = "case-homepage-no data sources" + CASE_HOMEPAGE_MULTI_AGENCY_NONZERO_DATA_SOURCES = "case-homepage-multi agency nonzero data sources" \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py new file mode 100644 index 00000000..b6a3b776 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py @@ -0,0 +1,15 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLAutoAgencyIDSubtaskPydantic(BulkInsertableModel): + url_id: int + subtask: AutoAgencyIDSubtask + agencies_found: bool + auto_comment: str | None = None + + @classmethod + def sa_model(cls) -> type[Base]: + return URLAutoAgencyIDSubtask \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py new file mode 100644 index 00000000..ab710055 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -0,0 +1,27 @@ +from src.db.models.helpers import enum_column +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtask, SubtaskDetailCode +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.templates_.with_id import WithIDBase + +import sqlalchemy as sa + +class URLAutoAgencyIDSubtask( + WithIDBase, + URLDependentMixin, 
+ CreatedAtMixin +): + + __tablename__ = "url_auto_agency_id_subtasks" + + subtask = enum_column( + AutoAgencyIDSubtask, + name="agency_auto_suggestion_method" + ) + agencies_found = sa.Column( + sa.Boolean(), + nullable=False + ) + detail = enum_column( + SubtaskDetailCode, + name="agency_id_subtask_detail_code", + ) \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index ee357ad4..66dd2e92 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Any from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType @@ -162,14 +162,17 @@ async def sync_agencies( ) headers = await self.access_manager.jwt_header() headers['Content-Type'] = "application/json" + request_params: dict[str, Any] = { + "page": params.page + } + if params.cutoff_date is not None: + params["updated_at"] = params.cutoff_date + request_info = RequestInfo( type_=RequestType.GET, url=url, headers=headers, - params={ - "page": params.page, - "updated_at": params.cutoff_date - } + params=request_params ) response_info = await self.access_manager.make_request(request_info) return AgenciesSyncResponseInfo( diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index b8227c7c..5b56fca3 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -125,4 +125,16 @@ def batch_id_column(nullable=False) -> sa.Column: ), nullable=nullable, comment='A foreign key to the `batches` table.' + ) + +def agency_id_column(nullable=False) -> sa.Column: + return sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey( + 'agencies.agency_id', + ondelete='CASCADE' + ), + nullable=nullable, + comment='A foreign key to the `agencies` table.' 
) \ No newline at end of file From 2bdaf1d051cf2abd44dcf3fac249d5d72601ea62 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 3 Sep 2025 21:16:00 -0400 Subject: [PATCH 16/33] Continue draft --- ...aee0dd79_overhaul_agency_identification.py | 19 +- pyproject.toml | 1 + .../agency/get/queries/agency_suggestion.py | 71 ++- .../agency/get/queries/next_for_annotation.py | 18 +- src/api/endpoints/review/next/query.py | 4 +- src/core/helpers.py | 48 -- .../operators/agency_identification/core.py | 112 ++--- .../agency_identification/dtos/output.py | 9 - .../agency_identification/exceptions.py | 4 + .../agency_identification/subtasks/convert.py | 54 ++ .../subtasks/impl/ckan.py | 33 -- .../subtasks/impl/ckan_}/__init__.py | 0 .../subtasks/impl/ckan_/core.py | 49 ++ .../subtasks/impl/ckan_/params.py | 6 + .../{nlp_location_match.py => ckan_/query.py} | 0 .../subtasks/impl/homepage_match.py | 15 - .../subtasks/impl/homepage_match_/__init__.py | 0 .../subtasks/impl/homepage_match_/core.py | 7 + .../subtasks/impl/homepage_match_/query.py | 0 .../subtasks/impl/muckrock.py | 48 -- .../subtasks/impl/muckrock_/__init__.py | 0 .../subtasks/impl/muckrock_/core.py | 88 ++++ .../subtasks/impl/muckrock_/params.py | 6 + .../subtasks/impl/muckrock_/query.py | 0 .../impl/nlp_location_match_/__init__.py | 0 .../impl/nlp_location_match_/constants.py | 4 + .../impl/nlp_location_match_/convert.py | 62 +++ .../subtasks/impl/nlp_location_match_/core.py | 77 +++ .../nlp_location_match_/models/__init__.py | 0 .../impl/nlp_location_match_/models/input.py | 6 + .../processor_/__init__.py | 0 .../nlp_location_match_/processor_/check.py | 9 + .../nlp_location_match_/processor_/convert.py | 27 + .../nlp_location_match_/processor_/core.py | 58 +++ .../processor_/mappings.py | 59 +++ .../processor_/models/__init__.py | 0 .../processor_/models/params.py | 6 + .../processor_/models/response.py | 9 + .../processor_/models/us_state.py | 8 + .../impl/nlp_location_match_/query.py | 36 ++ 
.../subtasks/impl/unknown.py | 6 +- .../agency_identification/subtasks/loader.py | 55 +- .../subtasks/models/subtask.py | 18 + .../subtasks/models/suggestion.py | 6 + .../subtasks/planner/__init__.py | 0 .../subtasks/planner/constants.py | 9 + .../subtasks/planner/core.py | 30 ++ .../subtasks/planner/queries/__init__.py | 0 .../subtasks/planner/queries/core.py | 26 + .../subtasks/planner/queries/ctes/README.md | 3 + .../subtasks/planner/queries/ctes/__init__.py | 0 .../subtasks/planner/queries/ctes/base.py | 24 + .../subtasks/planner/queries/ctes/ckan.py | 0 .../subtasks/planner/queries/ctes/homepage.py | 0 .../subtasks/planner/queries/ctes/muckrock.py | 0 .../planner/queries/ctes/nlp_location.py | 0 .../subtasks/planner/reconcile.py | 23 + .../subtasks/queries/__init__.py | 0 .../subtasks/queries/insert.py | 0 .../subtasks/templates/subtask.py | 70 ++- src/db/client/async_.py | 31 +- src/db/client/types.py | 4 - src/db/constants.py | 4 +- src/db/dto_converter.py | 65 +-- src/db/models/exceptions.py | 4 + src/db/models/impl/agency/sqlalchemy.py | 3 +- src/db/models/impl/url/core/sqlalchemy.py | 5 +- .../models/impl/url/suggestion/agency/auto.py | 23 - .../url/suggestion/agency/subtask/enum.py | 14 +- .../url/suggestion/agency/subtask/pydantic.py | 7 +- .../suggestion/agency/subtask/sqlalchemy.py | 7 +- .../suggestion/agency/suggestion/__init__.py | 0 .../agency/{link => suggestion}/pydantic.py | 4 +- .../agency/{link => suggestion}/sqlalchemy.py | 4 +- src/db/models/mixins.py | 19 +- src/db/models/views/__init__.py | 0 .../views/has_agency_auto_suggestion.py | 31 ++ src/db/models/views/url_annotations_flags.py | 49 ++ src/db/statement_composer.py | 33 +- src/external/pdap/client.py | 10 + .../search_agency_by_location/__init__.py | 0 .../dtos/search_agency_by_location/params.py | 7 + .../search_agency_by_location/response.py | 10 + src/util/alembic_helpers.py | 12 + .../integration/db/structure/test_view.py | 70 +++ .../happy_path/asserts.py | 4 +- 
.../happy_path/test_happy_path.py | 18 +- .../subtasks/test_ckan.py | 6 +- .../subtasks/test_muckrock.py | 6 +- .../subtasks/test_unknown.py | 2 +- tests/helpers/setup/wipe.py | 2 + uv.lock | 468 ++++++++++++++++++ 92 files changed, 1666 insertions(+), 479 deletions(-) delete mode 100644 src/core/helpers.py delete mode 100644 src/core/tasks/url/operators/agency_identification/dtos/output.py create mode 100644 src/core/tasks/url/operators/agency_identification/exceptions.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/convert.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py rename src/{db/models/impl/url/suggestion/agency/link => core/tasks/url/operators/agency_identification/subtasks/impl/ckan_}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/{nlp_location_match.py => ckan_/query.py} (100%) delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/query.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py create mode 100644 
src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/mappings.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/params.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/us_state.py create mode 100644 
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/README.md create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/ckan.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/homepage.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/muckrock.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/nlp_location.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/insert.py create mode 100644 src/db/models/exceptions.py delete mode 100644 src/db/models/impl/url/suggestion/agency/auto.py create mode 
100644 src/db/models/impl/url/suggestion/agency/suggestion/__init__.py rename src/db/models/impl/url/suggestion/agency/{link => suggestion}/pydantic.py (53%) rename src/db/models/impl/url/suggestion/agency/{link => suggestion}/sqlalchemy.py (84%) create mode 100644 src/db/models/views/__init__.py create mode 100644 src/db/models/views/has_agency_auto_suggestion.py create mode 100644 src/db/models/views/url_annotations_flags.py create mode 100644 src/external/pdap/dtos/search_agency_by_location/__init__.py create mode 100644 src/external/pdap/dtos/search_agency_by_location/params.py create mode 100644 src/external/pdap/dtos/search_agency_by_location/response.py create mode 100644 tests/automated/integration/db/structure/test_view.py diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py index 89f3e750..a255fa45 100644 --- a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -10,7 +10,8 @@ from alembic import op import sqlalchemy as sa -from src.util.alembic_helpers import id_column, url_id_column, created_at_column, agency_id_column, updated_at_column +from src.util.alembic_helpers import id_column, url_id_column, created_at_column, agency_id_column, updated_at_column, \ + task_id_column # revision identifiers, used by Alembic. 
revision: str = '70baaee0dd79' @@ -18,11 +19,11 @@ branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None -URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME: str = "url_has_agency_suggestions_view" +URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME: str = "url_has_agency_auto_suggestions_view" URL_UNKNOWN_AGENCIES_VIEW_NAME: str = "url_unknown_agencies_view" -URL_AUTO_AGENCY_SUBTASK_TABLE_NAME: str = "url_auto_agency_subtask" -LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "link_agency_id_subtask_agencies" +URL_AUTO_AGENCY_SUBTASK_TABLE_NAME: str = "url_auto_agency_id_subtasks" +LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "agency_id_subtask_suggestions" URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME: str = "url_auto_agency_suggestions" @@ -33,11 +34,7 @@ SUBTASK_DETAIL_CODE_ENUM = sa.Enum( 'no details', - 'blacklist-ckan-no ckan collector', - 'blacklist-muckrock-no muckrock collector', - 'blacklist-nlp-no html', - 'blacklist-homepage-root url', - 'blacklist-homepage-no meta urls associated with root', + 'retrieval error', 'case-homepage-single agency', 'case-homepage-no data sources', 'case-homepage-multi agency nonzero data sources', @@ -128,6 +125,7 @@ def _create_url_auto_agency_subtask_table(): op.create_table( URL_AUTO_AGENCY_SUBTASK_TABLE_NAME, id_column(), + task_id_column(), url_id_column(), sa.Column( "subtask", @@ -142,7 +140,8 @@ def _create_url_auto_agency_subtask_table(): sa.Column( "detail", SUBTASK_DETAIL_CODE_ENUM, - nullable=True + server_default=sa.text("'no details'"), + nullable=False ), created_at_column() ) diff --git a/pyproject.toml b/pyproject.toml index 51eca7a2..9da9a0f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "python-dotenv~=1.0.1", "requests~=2.32.3", "side-effects>=1.6.dev0", + "spacy>=3.8.7", "sqlalchemy~=2.0.36", "starlette~=0.45.3", "tqdm>=4.64.1", diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py 
b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py index 1f202263..52c58c40 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py @@ -1,10 +1,6 @@ -from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo -from src.core.enums import SuggestionType -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -19,37 +15,38 @@ def __init__( async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo]: # Get relevant autosuggestions and agency info, if an associated agency exists + raise NotImplementedError("Revise") - statement = ( - select( - AutomatedUrlAgencySuggestion.agency_id, - AutomatedUrlAgencySuggestion.is_unknown, - Agency.name, - Agency.state, - Agency.county, - Agency.locality - ) - .join(Agency, isouter=True) - .where(AutomatedUrlAgencySuggestion.url_id == self.url_id) - ) - raw_autosuggestions = await session.execute(statement) - autosuggestions = raw_autosuggestions.all() - agency_suggestions = [] - for autosuggestion in autosuggestions: - agency_id = autosuggestion[0] - is_unknown = autosuggestion[1] - name = autosuggestion[2] - state = autosuggestion[3] - county = autosuggestion[4] - locality = autosuggestion[5] - agency_suggestions.append( - GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, - pdap_agency_id=agency_id, - agency_name=name, - state=state, - county=county, - locality=locality - ) - ) - return agency_suggestions \ No newline at end of file + # statement = ( + # select( + # AutomatedUrlAgencySuggestion.agency_id, + # AutomatedUrlAgencySuggestion.is_unknown, + # Agency.name, + # 
Agency.state, + # Agency.county, + # Agency.locality + # ) + # .join(Agency, isouter=True) + # .where(AutomatedUrlAgencySuggestion.url_id == self.url_id) + # ) + # raw_autosuggestions = await session.execute(statement) + # autosuggestions = raw_autosuggestions.all() + # agency_suggestions = [] + # for autosuggestion in autosuggestions: + # agency_id = autosuggestion[0] + # is_unknown = autosuggestion[1] + # name = autosuggestion[2] + # state = autosuggestion[3] + # county = autosuggestion[4] + # locality = autosuggestion[5] + # agency_suggestions.append( + # GetNextURLForAgencyAgencyInfo( + # suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, + # pdap_agency_id=agency_id, + # agency_name=name, + # state=state, + # county=county, + # locality=locality + # ) + # ) + # return agency_suggestions \ No newline at end of file diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index ea0ae85e..e8f22870 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -12,7 +12,6 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -63,14 +62,15 @@ async def run( ) ) # Must have extant autosuggestions - .join(AutomatedUrlAgencySuggestion, isouter=True) - .where( - exists( - select(AutomatedUrlAgencySuggestion). - where(AutomatedUrlAgencySuggestion.url_id == URL.id). 
- correlate(URL) - ) - ) + # TODO: Replace with new logic + # .join(AutomatedUrlAgencySuggestion, isouter=True) + # .where( + # exists( + # select(AutomatedUrlAgencySuggestion). + # where(AutomatedUrlAgencySuggestion.url_id == URL.id). + # correlate(URL) + # ) + # ) # Must not have confirmed agencies .join(LinkURLAgency, isouter=True) .where( diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index e7314edd..8c50a7af 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -17,7 +17,6 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase @@ -43,7 +42,8 @@ def __init__(self, batch_id: int | None = None): ] # The below relationships are joined to entities that are joined to the URL self.double_join_relationships = [ - (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), + # TODO: Replace with new logic + # (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), (URL.confirmed_agencies, LinkURLAgency.agency) ] diff --git a/src/core/helpers.py b/src/core/helpers.py deleted file mode 100644 index eeb951fe..00000000 --- a/src/core/helpers.py +++ /dev/null @@ -1,48 +0,0 @@ -from src.core.enums import SuggestionType -from src.core.exceptions import MatchAgencyError -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums 
import MatchAgencyResponseStatus - - -def process_match_agency_response_to_suggestions( - url_id: int, - match_agency_response: MatchAgencyResponse -) -> list[URLAgencySuggestionInfo]: - if match_agency_response.status == MatchAgencyResponseStatus.EXACT_MATCH: - match = match_agency_response.matches[0] - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=int(match.id), - agency_name=match.submitted_name, - state=match.state, - county=match.county, - ) - ] - if match_agency_response.status == MatchAgencyResponseStatus.NO_MATCH: - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.UNKNOWN, - ) - ] - - if match_agency_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: - raise MatchAgencyError( - f"Unknown Match Agency Response Status: {match_agency_response.status}" - ) - - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=match.id, - agency_name=match.submitted_name, - state=match.state, - county=match.county, - locality=match.locality - ) - for match in match_agency_response.matches - ] diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index 7d15c06f..9c2e00f4 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -1,14 +1,12 @@ -from src.collectors.enums import CollectorType -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.output import GetAgencySuggestionsOutput -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import 
AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.exceptions import SubtaskError from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader +from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo +from src.core.tasks.url.operators.agency_identification.subtasks.planner.core import AgencyIDSubtaskPlanner +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType class AgencyIdentificationTaskOperator(URLTaskOperatorBase): @@ -17,93 +15,49 @@ def __init__( self, adb_client: AsyncDatabaseClient, loader: AgencyIdentificationSubtaskLoader, + planner: AgencyIDSubtaskPlanner, ): super().__init__(adb_client) self.loader = loader + self._subtask: AutoAgencyIDSubtaskType | None = None + self.planner = planner @property def task_type(self) -> TaskType: return TaskType.AGENCY_IDENTIFICATION async def meets_task_prerequisites(self) -> bool: - has_urls_without_agency_suggestions = await self.adb_client.has_urls_without_agency_suggestions() - return has_urls_without_agency_suggestions - - async def get_pending_urls_without_agency_identification(self) -> list[AgencyIdentificationTDO]: - return await self.adb_client.get_urls_without_agency_suggestions() - - async def get_subtask( + """ + Modifies: + - self._subtask + """ + subtask_type: AutoAgencyIDSubtaskType | None = await self.planner.plan_next_subtask() + if subtask_type is None: + return False + self._subtask = subtask_type + return True + + + async def load_subtask( self, - collector_type: CollectorType - 
) -> AgencyIdentificationSubtaskBase: + subtask_type: AutoAgencyIDSubtaskType + ) -> AgencyIDSubtaskOperatorBase: """Get subtask based on collector type.""" - return await self.loader.load_subtask(collector_type) + return await self.loader.load_subtask(subtask_type) + + async def plan_next_subtask(self) -> AutoAgencyIDSubtaskType | None: + return await self.planner.plan_next_subtask() @staticmethod async def run_subtask( - subtask: AgencyIdentificationSubtaskBase, - url_id: int, - collector_metadata: dict | None - ) -> list[URLAgencySuggestionInfo]: - return await subtask.run( - url_id=url_id, - collector_metadata=collector_metadata - ) + subtask_operator: AgencyIDSubtaskOperatorBase, + ) -> AgencyIDSubtaskRunInfo: + return await subtask_operator.run() async def inner_task_logic(self) -> None: - tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification() - await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) - output = await self._get_agency_suggestions(tdos) - - await self._process_agency_suggestions(output.agency_suggestions) - await self.adb_client.add_url_error_infos(output.error_infos) - - async def _process_agency_suggestions( - self, - suggestions: list[URLAgencySuggestionInfo] - ) -> None: - non_unknown_agency_suggestions = [ - suggestion for suggestion in suggestions - if suggestion.suggestion_type != SuggestionType.UNKNOWN - ] - await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions) - confirmed_suggestions = [ - suggestion for suggestion in suggestions - if suggestion.suggestion_type == SuggestionType.CONFIRMED - ] - await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions) - non_confirmed_suggestions = [ - suggestion for suggestion in suggestions - if suggestion.suggestion_type != SuggestionType.CONFIRMED - ] - await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions) - - async def _get_agency_suggestions( - self, - tdos: 
list[AgencyIdentificationTDO] - ) -> GetAgencySuggestionsOutput: - error_infos = [] - all_agency_suggestions = [] - for tdo in tdos: - subtask = await self.get_subtask(tdo.collector_type) - try: - new_agency_suggestions = await self.run_subtask( - subtask, - tdo.url_id, - tdo.collector_metadata - ) - all_agency_suggestions.extend(new_agency_suggestions) - except Exception as e: - error_info = URLErrorPydanticInfo( - task_id=self.task_id, - url_id=tdo.url_id, - error=str(e), - ) - error_infos.append(error_info) - output = GetAgencySuggestionsOutput( - agency_suggestions=all_agency_suggestions, - error_infos=error_infos - ) - return output + subtask_operator: AgencyIDSubtaskOperatorBase = await self.load_subtask(self._subtask) + run_info: AgencyIDSubtaskRunInfo = await self.run_subtask(subtask_operator) + if not run_info.is_success: + raise SubtaskError(run_info.error) diff --git a/src/core/tasks/url/operators/agency_identification/dtos/output.py b/src/core/tasks/url/operators/agency_identification/dtos/output.py deleted file mode 100644 index d7381129..00000000 --- a/src/core/tasks/url/operators/agency_identification/dtos/output.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo - - -class GetAgencySuggestionsOutput(BaseModel): - error_infos: list[URLErrorPydanticInfo] - agency_suggestions: list[URLAgencySuggestionInfo] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/exceptions.py b/src/core/tasks/url/operators/agency_identification/exceptions.py new file mode 100644 index 00000000..709189e3 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/exceptions.py @@ -0,0 +1,4 @@ + + +class SubtaskError(Exception): + pass \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py new file mode 100644 index 00000000..976e6e4a --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -0,0 +1,54 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.enums import MatchAgencyResponseStatus + +def convert_match_agency_response_to_subtask_data( + url_id: int, + response: MatchAgencyResponse, + subtask_type: AutoAgencyIDSubtaskType, + task_id: int +): + suggestions: list[AgencySuggestion] = \ + _convert_match_agency_response_to_suggestions( + response + ) + agencies_found: bool = len(suggestions) > 0 + subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( + url_id=url_id, + subtask=subtask_type, + agencies_found=agencies_found, + task_id=task_id + ) + return AutoAgencyIDSubtaskData( + pydantic_model=subtask_pydantic, + suggestions=suggestions + ) + +def _convert_match_agency_response_to_suggestions( + match_response: MatchAgencyResponse, +) -> list[AgencySuggestion]: + if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH: + match_info: MatchAgencyInfo = match_response.matches[0] + return [ + AgencySuggestion( + agency_id=int(match_info.id), + confidence=100 + ) + ] + if match_response.status == MatchAgencyResponseStatus.NO_MATCH: + return [] + if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: + raise 
ValueError(f"Unknown Match Agency Response Status: {match_response.status}") + total_confidence: int = 100 + confidence_per_match: int = total_confidence // len(match_response.matches) + return [ + AgencySuggestion( + agency_id=int(match_info.id), + confidence=confidence_per_match + ) + for match_info in match_response.matches + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py deleted file mode 100644 index 19d70db5..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import final - -from typing_extensions import override - -from src.core.helpers import process_match_agency_response_to_suggestions -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse - -@final -class CKANAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - - def __init__( - self, - pdap_client: PDAPClient - ): - self.pdap_client = pdap_client - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - agency_name = collector_metadata["agency_name"] - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_name - ) - return process_match_agency_response_to_suggestions( - url_id=url_id, - match_agency_response=match_agency_response - ) diff --git a/src/db/models/impl/url/suggestion/agency/link/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/__init__.py similarity index 100% rename from 
src/db/models/impl/url/suggestion/agency/link/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py new file mode 100644 index 00000000..925411f1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -0,0 +1,49 @@ +from typing import final + +from typing_extensions import override + +from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ + convert_match_agency_response_to_subtask_data +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ + AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse + + +@final +class CKANAgencyIDSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + pdap_client: PDAPClient + ): + super().__init__(adb_client, task_id=task_id) + self.pdap_client = pdap_client + + @override + async def inner_logic(self) -> None: + params: list[CKANAgencyIDSubtaskParams] = await self._get_params() + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for param in params: + agency_name: str = param.collector_metadata["agency_name"] + response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_name + ) + subtask_data: AutoAgencyIDSubtaskData = 
convert_match_agency_response_to_subtask_data( + url_id=param.url_id, + response=response, + subtask_type=AutoAgencyIDSubtaskType.CKAN, + task_id=self.task_id + ) + subtask_data_list.append(subtask_data) + + await self._upload_subtask_data(subtask_data_list) + + async def _get_params(self) -> list[CKANAgencyIDSubtaskParams]: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py new file mode 100644 index 00000000..ce4b7ce1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class CKANAgencyIDSubtaskParams(BaseModel): + url_id: int + collector_metadata: dict \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py deleted file mode 100644 index 604f21bf..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match.py +++ /dev/null @@ -1,15 +0,0 @@ -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase -from src.db.client.async_ import AsyncDatabaseClient - - -class HomepageMatchSubtask(AgencyIdentificationSubtaskBase): - - def __init__(self, db_client: 
AsyncDatabaseClient): - self.db_client = db_client - - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> URLAgencySuggestionInfo: \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py new file mode 100644 index 00000000..745223d6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py @@ -0,0 +1,7 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase + + +class HomepageMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): + + async def inner_logic(self) -> None: + raise NotImplementedError() \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/query.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py deleted file mode 100644 index 307e61ee..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import final - -from typing_extensions import override - -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from 
src.core.exceptions import MuckrockAPIError -from src.core.helpers import process_match_agency_response_to_suggestions -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse - -@final -class MuckrockAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - - def __init__( - self, - muckrock_api_interface: MuckrockAPIInterface, - pdap_client: PDAPClient - ): - self.muckrock_api_interface = muckrock_api_interface - self.pdap_client = pdap_client - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - muckrock_agency_id = collector_metadata["agency"] - agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( - muckrock_agency_id=muckrock_agency_id - ) - if agency_lookup_response.type != AgencyLookupResponseType.FOUND: - raise MuckrockAPIError( - f"Failed to lookup muckrock agency: {muckrock_agency_id}:" - f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" - ) - - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_lookup_response.name - ) - return process_match_agency_response_to_suggestions( - url_id=url_id, - match_agency_response=match_agency_response - ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py new file mode 100644 
index 00000000..28ee8f29 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -0,0 +1,88 @@ +from typing import final + +from typing_extensions import override + +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType +from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ + convert_match_agency_response_to_subtask_data +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ + MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse + + +@final +class MuckrockAgencyIDSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + muckrock_api_interface: MuckrockAPIInterface, + pdap_client: PDAPClient + ): + super().__init__(adb_client, task_id=task_id) + self.muckrock_api_interface = muckrock_api_interface + self.pdap_client = pdap_client + + @override + async def inner_logic(self) -> None: + params: list[MuckrockAgencyIDSubtaskParams] = await self._get_params() + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for param in params: + muckrock_agency_id: int = param.collector_metadata["agency"] + 
agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( + muckrock_agency_id=muckrock_agency_id + ) + if agency_lookup_response.type != AgencyLookupResponseType.FOUND: + data: AutoAgencyIDSubtaskData = await self._error_subtask_data( + url_id=param.url_id, + muckrock_agency_id=muckrock_agency_id, + agency_lookup_response=agency_lookup_response + ) + subtask_data_list.append(data) + continue + match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_lookup_response.name + ) + subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + url_id=param.url_id, + response=match_agency_response, + subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, + task_id=self.task_id + ) + subtask_data_list.append(subtask_data) + + await self._upload_subtask_data(subtask_data_list) + + + async def _error_subtask_data( + self, + url_id: int, + muckrock_agency_id: int, + agency_lookup_response: AgencyLookupResponse + ) -> AutoAgencyIDSubtaskData: + pydantic_model = URLAutoAgencyIDSubtaskPydantic( + task_id=self.task_id, + url_id=url_id, + subtask=AutoAgencyIDSubtaskType.MUCKROCK, + agencies_found=False, + detail=SubtaskDetailCode.RETRIEVAL_ERROR + ) + error: str = f"Failed to lookup muckrock agency: {muckrock_agency_id}:" + \ + f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" + return AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=[], + error=error + ) + + async def _get_params(self) -> list[MuckrockAgencyIDSubtaskParams]: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py new file mode 100644 index 00000000..6010f022 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py @@ -0,0 +1,6 @@ +from pydantic
import BaseModel + + +class MuckrockAgencyIDSubtaskParams(BaseModel): + url_id: int + collector_metadata: dict \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py new file mode 100644 index 00000000..fb8f22ba --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py @@ -0,0 +1,4 @@ + + +ITERATIONS_PER_SUBTASK = 1 +NUMBER_OF_ENTRIES_PER_ITERATION = 10 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py new file mode 100644 index 00000000..d2f14477 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py @@ -0,0 +1,62 @@ +from math import ceil + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from 
src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +def convert_nlp_response_to_search_agency_by_location_params( + url_id: int, + nlp_response: NLPLocationMatchResponse, +) -> SearchAgencyByLocationParams: + return SearchAgencyByLocationParams( + request_id=url_id, + locations=nlp_response.locations, + state_iso=nlp_response.us_state.iso, + ) + +def convert_search_agency_responses_to_subtask_data_list( + responses: list[SearchAgencyByLocationResponse], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for response in responses: + subtask_data: AutoAgencyIDSubtaskData = \ + convert_search_agency_response_to_subtask_data( + response=response, + task_id=task_id, + ) + subtask_data_list.append(subtask_data) + return subtask_data_list + +def convert_search_agency_response_to_subtask_data( + response: SearchAgencyByLocationResponse, + task_id: int +) -> AutoAgencyIDSubtaskData: + suggestions: list[AgencySuggestion] = [] + url_id: int = response.request_id + for result in response.results: + agency_id: int = result.agency_id + similarity: float = result.similarity + confidence: int = ceil(similarity * 100) + suggestion: AgencySuggestion = AgencySuggestion( + agency_id=agency_id, + confidence=confidence, + ) + suggestions.append(suggestion) + + pydantic_model = URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + subtask=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=len(suggestions) > 0 + ) + return AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=suggestions + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py new file mode 100644 index 00000000..3999cc42 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -0,0 +1,77 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ + ITERATIONS_PER_SUBTASK +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.convert import \ + convert_nlp_response_to_search_agency_by_location_params, convert_search_agency_responses_to_subtask_data_list +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.core import NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query import \ + GetNLPLocationMatchSubtaskInputQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class NLPLocationMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + pdap_client: PDAPClient, + processor: NLPProcessor + ) -> None: + super().__init__(adb_client, task_id) + 
self.processor = processor + self.pdap_client = pdap_client + + async def inner_logic(self) -> None: + for iteration in range(ITERATIONS_PER_SUBTASK): + inputs: list[NLPLocationMatchSubtaskInput] = await self._get_from_db() + if len(inputs) == 0: + break + await self.run_subtask_iteration(inputs) + + async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: + search_params: list[SearchAgencyByLocationParams] = [] + for input_ in inputs: + nlp_response: NLPLocationMatchResponse = await self._get_location_match(input_.html) + search_param: SearchAgencyByLocationParams = \ + convert_nlp_response_to_search_agency_by_location_params( + url_id=input_.url_id, + nlp_response=nlp_response, + ) + search_params.append(search_param) + + search_responses: list[SearchAgencyByLocationResponse] = \ + await self._get_pdap_info(search_params) + + subtask_data_list: list[AutoAgencyIDSubtaskData] = \ + convert_search_agency_responses_to_subtask_data_list( + responses=search_responses, + task_id=self.task_id, + ) + + await self._upload_subtask_data(subtask_data_list) + + async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: + return await self.adb_client.run_query_builder( + query_builder=GetNLPLocationMatchSubtaskInputQueryBuilder(), + ) + + async def _get_pdap_info( + self, + params: list[SearchAgencyByLocationParams] + ) -> list[SearchAgencyByLocationResponse]: + return await self.pdap_client.search_agency_by_location(params) + + async def _get_location_match( + self, + html: str + ) -> NLPLocationMatchResponse: + return self.processor.parse_for_locations(html) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py new file mode 100644 index 00000000..398c1504 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class NLPLocationMatchSubtaskInput(BaseModel): + url_id: int + html: str \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py new file mode 100644 index 00000000..2019cbcf --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py @@ -0,0 +1,9 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.mappings import \ + US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO + + +def is_iso_us_state(iso: str) -> bool: + return iso in US_STATE_ISO_TO_NAME + +def is_name_us_state(name: str) -> bool: + return name in US_NAME_TO_STATE_ISO \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py new file mode 100644 index 00000000..f29bb11b --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py @@ -0,0 +1,27 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.mappings 
import \ + US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.us_state import \ + USState + + +def convert_us_state_iso_to_us_state(iso: str) -> USState | None: + name: str | None = US_STATE_ISO_TO_NAME.get(iso, None) + + if name is None: + return None + + return USState( + name=name, + iso=iso + ) + +def convert_us_state_name_to_us_state(name: str) -> USState | None: + iso: str | None = US_NAME_TO_STATE_ISO.get(name, None) + + if iso is None: + return None + + return USState( + name=name, + iso=iso + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py new file mode 100644 index 00000000..45b8d235 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py @@ -0,0 +1,58 @@ +from collections import Counter +from typing import Mapping + +from spacy import Language +from spacy.tokens import Doc + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.check import \ + is_name_us_state, is_iso_us_state +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.convert import \ + convert_us_state_name_to_us_state, convert_us_state_iso_to_us_state +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.us_state import \ + USState + + +class NLPProcessor: + + def __init__( + self, + model: Language + ): + self._model: Language = model + + def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: + doc: Doc = 
self._model(html) + us_state_counter: Counter[USState] = Counter() + location_counter: Counter[str] = Counter() + + for ent in doc.ents: + if ent.label_ != "GPE": # Geopolitical Entity + continue + text: str = ent.text + if is_name_us_state(text): + us_state: USState | None = convert_us_state_name_to_us_state(text) + if us_state is not None: + us_state_counter[us_state] += 1 + continue + if is_iso_us_state(text): + us_state: USState | None = convert_us_state_iso_to_us_state(text) + if us_state is not None: + us_state_counter[us_state] += 1 + continue + location_counter[text] += 1 + + most_common_us_state: USState | None = us_state_counter.most_common(1)[0][0] if us_state_counter else None + top_5_locations_raw: list[tuple[str, int]] = location_counter.most_common(5) + top_5_locations: list[str] = [] + for location, _ in top_5_locations_raw: + top_5_locations.append(location) + + return NLPLocationMatchResponse( + us_state=most_common_us_state, + locations=top_5_locations + ) + + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/mappings.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/mappings.py new file mode 100644 index 00000000..03417480 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/mappings.py @@ -0,0 +1,59 @@ + + +US_STATE_ISO_TO_NAME: dict[str, str] = { + 'AL': 'Alabama', + 'AK': 'Alaska', + 'AZ': 'Arizona', + 'AR': 'Arkansas', + 'CA': 'California', + 'CO': 'Colorado', + 'CT': 'Connecticut', + 'DE': 'Delaware', + 'FL': 'Florida', + 'GA': 'Georgia', + 'HI': 'Hawaii', + 'ID': 'Idaho', + 'IL': 'Illinois', + 'IN': 'Indiana', + 'IA': 'Iowa', + 'KS': 'Kansas', + 'KY': 'Kentucky', + 'LA': 'Louisiana', + 'ME': 'Maine', + 'MD': 'Maryland', + 'MA': 'Massachusetts', + 'MI': 'Michigan', + 'MN': 'Minnesota', + 'MS': 'Mississippi', + 'MO': 'Missouri', + 'MT': 'Montana', + 'NE': 'Nebraska', + 'NV': 'Nevada', + 'NH': 'New 
Hampshire', + 'NJ': 'New Jersey', + 'NM': 'New Mexico', + 'NY': 'New York', + 'NC': 'North Carolina', + 'ND': 'North Dakota', + 'OH': 'Ohio', + 'OK': 'Oklahoma', + 'OR': 'Oregon', + 'PA': 'Pennsylvania', + 'RI': 'Rhode Island', + 'SC': 'South Carolina', + 'SD': 'South Dakota', + 'TN': 'Tennessee', + 'TX': 'Texas', + 'UT': 'Utah', + 'VT': 'Vermont', + 'VA': 'Virginia', + 'WA': 'Washington', + 'WV': 'West Virginia', + 'WI': 'Wisconsin', + 'WY': 'Wyoming', + 'DC': 'District of Columbia', +} + +US_NAME_TO_STATE_ISO: dict[str, str] = { + name: iso for iso, name in US_STATE_ISO_TO_NAME.items() +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/params.py new file mode 100644 index 00000000..79378612 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class NLPLocationMatchParams(BaseModel): + url_id: int + html: str \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py new file mode 100644 index 00000000..bd536dd5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.us_state import \ + USState + + +class NLPLocationMatchResponse(BaseModel): + locations: list[str] + us_state: USState | None \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/us_state.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/us_state.py new file mode 100644 index 00000000..0b29771f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/us_state.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel, ConfigDict + + +class USState(BaseModel): + model_config = ConfigDict(frozen=True) + + name: str + iso: str diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py new file mode 100644 index 00000000..7544ebaa --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py @@ -0,0 +1,36 @@ +from typing import Any + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.queries.base.builder import QueryBuilderBase + + +class GetNLPLocationMatchSubtaskInputQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[NLPLocationMatchSubtaskInput]: + + query = ( + select( + URL.id, + URLCompressedHTML.compressed_html + ) + .join( + URLCompressedHTML, + URLCompressedHTML.url_id == URL.id + ) + ) + + # TODO: Add 
additional joins and where conditions + # TODO: Maybe leverage CTEs from survey query to get the precise URL ids + # without having to redo the logic here + + + # TODO: Add limit leveraging NUMBER_OF_ENTRIES_PER_ITERATION constant diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py index 5f63cd03..cd741c5b 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py @@ -2,17 +2,17 @@ from src.core.enums import SuggestionType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase @final -class UnknownAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): +class UnknownAgencyIdentificationSubtask(AgencyIDSubtaskOperatorBase): """A subtask that returns an unknown suggestion. Used in cases where the agency cannot be reliably inferred from the source. 
""" @override - async def run( + async def inner_logic( self, url_id: int, collector_metadata: dict | None = None diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index a1dad90b..493a94d2 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,10 +1,20 @@ +import spacy + from src.collectors.enums import CollectorType from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIdentificationSubtaskBase -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ - MuckrockAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.core import \ + HomepageMatchSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import \ + MuckrockAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.core import \ + NLPLocationMatchSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.core import \ + NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import 
AutoAgencyIDSubtaskType from src.external.pdap.client import PDAPClient @@ -14,25 +24,48 @@ class AgencyIdentificationSubtaskLoader: def __init__( self, pdap_client: PDAPClient, - muckrock_api_interface: MuckrockAPIInterface + muckrock_api_interface: MuckrockAPIInterface, + adb_client: AsyncDatabaseClient ): self.pdap_client = pdap_client self.muckrock_api_interface = muckrock_api_interface + self.adb_client = adb_client - async def _load_muckrock_subtask(self) -> MuckrockAgencyIdentificationSubtask: - return MuckrockAgencyIdentificationSubtask( + async def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: + return MuckrockAgencyIDSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, muckrock_api_interface=self.muckrock_api_interface, pdap_client=self.pdap_client ) - async def _load_ckan_subtask(self) -> CKANAgencyIdentificationSubtask: - return CKANAgencyIdentificationSubtask( + async def _load_ckan_subtask(self, task_id: int) -> CKANAgencyIDSubtaskOperator: + return CKANAgencyIDSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, pdap_client=self.pdap_client ) - async def load_subtask(self, collector_type: CollectorType) -> AgencyIdentificationSubtaskBase: + async def _load_homepage_match_subtask(self, task_id: int) -> HomepageMatchSubtaskOperator: + return HomepageMatchSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + ) + + async def _load_nlp_location_match_subtask(self, task_id: int) -> NLPLocationMatchSubtaskOperator: + return NLPLocationMatchSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + pdap_client=self.pdap_client, + processor=NLPProcessor( + spacy.load('en_core_web_trf', disable=['parser']) + ) + ) + + + async def load_subtask(self, subtask_type: AutoAgencyIDSubtaskType) -> AgencyIDSubtaskOperatorBase: """Get subtask based on collector type.""" - match collector_type: + match subtask_type: case CollectorType.MUCKROCK_SIMPLE_SEARCH: return await 
self._load_muckrock_subtask() case CollectorType.MUCKROCK_COUNTY_SEARCH: diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py new file mode 100644 index 00000000..7da0a8f5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic + + +class AutoAgencyIDSubtaskData(BaseModel): + pydantic_model: URLAutoAgencyIDSubtaskPydantic + suggestions: list[AgencySuggestion] + error: str | None = None + + @property + def has_error(self) -> bool: + return self.error is not None + + @property + def url_id(self) -> int: + return self.pydantic_model.url_id \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py new file mode 100644 index 00000000..5dbc62ad --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AgencySuggestion(BaseModel): + agency_id: int + confidence: int \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py new file mode 100644 index 00000000..c7cf111e --- /dev/null +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py @@ -0,0 +1,9 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +# Determines priority of subtasks, all else being equal. +SUBTASK_HIERARCHY: list[AutoAgencyIDSubtaskType] = [ + AutoAgencyIDSubtaskType.CKAN, + AutoAgencyIDSubtaskType.MUCKROCK, + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH +] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py new file mode 100644 index 00000000..4968cf4e --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py @@ -0,0 +1,30 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.planner.queries.core import \ + AgencyIDSubtaskSurveyQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.planner.reconcile import reconcile_tiebreakers +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + + +class AgencyIDSubtaskPlanner: + + def __init__( + self, + adb_client: AsyncDatabaseClient, + ) -> None: + self.adb_client = adb_client + + # TODO: Add test to confirm properly returns one, multiple, or None + async def plan_next_subtask(self) -> AutoAgencyIDSubtaskType | None: + + applicable_subtasks: list[AutoAgencyIDSubtaskType] = \ + await self.adb_client.run_query_builder( + AgencyIDSubtaskSurveyQueryBuilder() + ) + + # Reconcile tiebreakers + if len(applicable_subtasks) == 0: + return None + if len(applicable_subtasks) > 1: + return await reconcile_tiebreakers(applicable_subtasks) + return applicable_subtasks[0] + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/__init__.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py new file mode 100644 index 00000000..7765612d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py @@ -0,0 +1,26 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.planner.constants import SUBTASK_HIERARCHY +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.queries.base.builder import QueryBuilderBase + + +class AgencyIDSubtaskSurveyQueryBuilder(QueryBuilderBase): + """ + Survey applicable URLs to determine next subtask to run + + URLs are "inapplicable" if they have any of the following properties: + - Are validated via FlagURLValidated model + - Have at least one annotation with agency suggestion with confidence >= 95 + - Have all possible subtasks completed + + Returns a list of one or more subtasks to run + based on which subtask(s) have the most applicable URLs + (or an empty list if no subtasks have applicable URLs) + """ + + async def run(self, session: AsyncSession) -> list[AutoAgencyIDSubtaskType]: + raise NotImplementedError + + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/README.md b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/README.md new file mode 100644 index 00000000..38324fa7 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/README.md @@ -0,0 +1,3 @@ +Contains CTEs for determining validity for each subtask. + +Each file corresponds to the validity CTE for that subtask. 
\ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py new file mode 100644 index 00000000..85820123 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py @@ -0,0 +1,24 @@ +from sqlalchemy import CTE, Column + + +class PrereqCTE: + """ + Base class for CTEs that determine validity for each subtask. + + Single column CTEs intended to be left-joined and considered valid only + if the joined row is not null. + """ + + def __init__( + self, + cte: CTE + ) -> None: + self._cte = cte + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> Column[int]: + return self.cte.columns[0] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/ckan.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/homepage.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/muckrock.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/nlp_location.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/nlp_location.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py new file mode 100644 index 00000000..f0575f0d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py @@ -0,0 +1,23 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.planner.constants import SUBTASK_HIERARCHY +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +# TODO: Add test to confirm expected behavior +async def reconcile_tiebreakers( + subtasks: list[AutoAgencyIDSubtaskType] +) -> AutoAgencyIDSubtaskType: + """In the case of multiple subtasks being applicable, + determine which one to run based on priority.""" + + # TODO: Figure out why type hints are mismatched with this + rank: dict[AutoAgencyIDSubtaskType, int] = { + subtask: rank + for rank, subtask in enumerate(SUBTASK_HIERARCHY) + } + + def key(subtask: AutoAgencyIDSubtaskType) -> tuple[int, str]: + r = rank.get(subtask, None) + if r is None: + raise ValueError(f"Subtask {subtask} not found in hierarchy") + return r, subtask.value + + return min(subtasks, key=key) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/insert.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/insert.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py index 
class AgencyIDSubtaskOperatorBase(ABC):
    """Template for agency-identification subtask operators.

    Subclasses implement ``inner_logic``; ``run`` wraps it so that any
    exception is reported via ``AgencyIDSubtaskRunInfo`` rather than
    propagated to the caller.
    """

    def __init__(
        self,
        adb_client: AsyncDatabaseClient,
        task_id: int
    ) -> None:
        self.adb_client: AsyncDatabaseClient = adb_client
        self.task_id: int = task_id

    async def run(self) -> AgencyIDSubtaskRunInfo:
        """Execute the subtask, converting any exception into an error result."""
        try:
            await self.inner_logic()
        except Exception as e:  # boundary: report the failure, don't raise
            return AgencyIDSubtaskRunInfo(error=str(e))
        return AgencyIDSubtaskRunInfo()

    @abc.abstractmethod
    async def inner_logic(self) -> None:
        """Subtask body.

        Annotated ``None`` (not ``AgencyIDSubtaskRunInfo``) because ``run``
        ignores the return value and builds its own result object.
        """
        raise NotImplementedError

    async def _upload_subtask_data(
        self,
        subtask_data_list: list[AutoAgencyIDSubtaskData]
    ) -> None:
        """Persist subtask rows, their agency suggestions, and any error info.

        Relies on ``bulk_insert(return_ids=True)`` returning ids in the same
        order as the inserted models, so ids can be zipped back onto
        ``subtask_data_list``.
        """
        # 1) Insert one subtask row per data item; capture generated ids.
        subtask_models: list[URLAutoAgencyIDSubtaskPydantic] = [
            subtask_data.pydantic_model
            for subtask_data in subtask_data_list
        ]
        subtask_ids: list[int] = await self.adb_client.bulk_insert(
            models=subtask_models,
            return_ids=True
        )

        # 2) Fan out each subtask's suggestions, linked via its new id.
        suggestions: list[AgencyIDSubtaskSuggestionPydantic] = []
        for subtask_id, subtask_info in zip(subtask_ids, subtask_data_list):
            for suggestion in subtask_info.suggestions:
                suggestions.append(
                    AgencyIDSubtaskSuggestionPydantic(
                        subtask_id=subtask_id,
                        agency_id=suggestion.agency_id,
                        confidence=suggestion.confidence,
                    )
                )
        await self.adb_client.bulk_insert(
            models=suggestions,
        )

        # 3) Record error info only for items flagged with an error.
        error_infos: list[URLErrorPydanticInfo] = [
            URLErrorPydanticInfo(
                url_id=subtask_info.url_id,
                error=subtask_info.error,
                task_id=self.task_id,
            )
            for subtask_info in subtask_data_list
            if subtask_info.has_error
        ]
        await self.adb_client.bulk_insert(
            models=error_infos,
        )
get_update_agencies_sync_progress_query -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ - convert_agencies_sync_response_to_agencies_upsert from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \ GetDataSourcesSyncParametersQueryBuilder @@ -71,9 +69,6 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.core import \ UpsertURLsFromDataSourcesQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \ - GetPendingURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.agency_identification.queries.has_urls_without_agency_suggestions import \ HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO @@ -126,7 +121,6 @@ from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.probed_for_404 import URLProbedFor404 -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -145,7 +139,6 @@ from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html -from 
src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo @@ -725,11 +718,6 @@ async def get_tasks( async def has_urls_without_agency_suggestions(self) -> bool: return await self.run_query_builder(HasURLsWithoutAgencySuggestionsQueryBuilder()) - async def get_urls_without_agency_suggestions( - self - ) -> list[AgencyIdentificationTDO]: - """Retrieve URLs without confirmed or suggested agencies.""" - return await self.run_query_builder(GetPendingURLsWithoutAgencySuggestionsQueryBuilder()) async def get_next_url_agency_for_annotation( self, @@ -783,14 +771,15 @@ async def add_agency_auto_suggestions( session: AsyncSession, suggestions: list[URLAgencySuggestionInfo] ): - for suggestion in suggestions: - url_agency_suggestion = AutomatedUrlAgencySuggestion( - url_id=suggestion.url_id, - agency_id=suggestion.pdap_agency_id, - is_unknown=suggestion.suggestion_type == SuggestionType.UNKNOWN, - confidence=0 - ) - session.add(url_agency_suggestion) + raise NotImplementedError("Revise") + # for suggestion in suggestions: + # url_agency_suggestion = AutomatedUrlAgencySuggestion( + # url_id=suggestion.url_id, + # agency_id=suggestion.pdap_agency_id, + # is_unknown=suggestion.suggestion_type == SuggestionType.UNKNOWN, + # confidence=0 + # ) + # session.add(url_agency_suggestion) @session_manager async def add_agency_manual_suggestion( diff --git a/src/db/client/types.py b/src/db/client/types.py index efdfdc72..02c0e39b 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -1,9 +1,5 @@ -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from 
src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion UserSuggestionModel = UserRelevantSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion -AutoSuggestionModel = AutoRelevantSuggestion or AutoRecordTypeSuggestion or AutomatedUrlAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index 505a6e58..3bab368f 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,4 +1,3 @@ -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -12,7 +11,8 @@ ALL_ANNOTATION_MODELS = [ AutoRecordTypeSuggestion, AutoRelevantSuggestion, - AutomatedUrlAgencySuggestion, + # TODO: Revise + # AutomatedUrlAgencySuggestion, UserRelevantSuggestion, UserRecordTypeSuggestion, UserUrlAgencySuggestion diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 979a3b51..39b53b89 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -1,21 +1,18 @@ -from typing import Optional - from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.api.endpoints.review.next.dto import FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, \ - FinalReviewAnnotationAgencyAutoInfo, FinalReviewAnnotationAgencyInfo + FinalReviewAnnotationAgencyInfo from src.core.enums import RecordType, SuggestionType from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from 
src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.with_html import URLWithHTML from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion @@ -65,47 +62,6 @@ def final_review_annotation_record_type_info( user=user_value ) - @staticmethod - def final_review_annotation_agency_auto_info( - automated_agency_suggestions: list[AutomatedUrlAgencySuggestion] - ) -> FinalReviewAnnotationAgencyAutoInfo: - - if len(automated_agency_suggestions) == 0: - return FinalReviewAnnotationAgencyAutoInfo( - unknown=True, - suggestions=[] - ) - - if len(automated_agency_suggestions) == 1: - suggestion = automated_agency_suggestions[0] - unknown = suggestion.is_unknown - else: - unknown = False - - if unknown: - return FinalReviewAnnotationAgencyAutoInfo( - unknown=True, - suggestions=[ - GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.UNKNOWN, - ) - ] - ) - - return FinalReviewAnnotationAgencyAutoInfo( - 
class WriteToViewError(Exception):
    """Raised when an ORM write targets a model mapped to a read-only DB view."""
back_populates="agency") + # TODO: Revise + # automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") confirmed_urls = relationship("LinkURLAgency", back_populates="agency") diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index b9c38732..9548136d 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -50,8 +50,9 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): secondary="link_task_urls", back_populates="urls", ) - automated_agency_suggestions = relationship( - "AutomatedUrlAgencySuggestion", back_populates="url") + # TODO: Revise + # automated_agency_suggestions = relationship( + # "AutomatedUrlAgencySuggestion", back_populates="url") user_agency_suggestion = relationship( "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( diff --git a/src/db/models/impl/url/suggestion/agency/auto.py b/src/db/models/impl/url/suggestion/agency/auto.py deleted file mode 100644 index 50fd5e03..00000000 --- a/src/db/models/impl/url/suggestion/agency/auto.py +++ /dev/null @@ -1,23 +0,0 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, Float -from sqlalchemy.orm import relationship - -from src.db.models.helpers import get_agency_id_foreign_column -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase -from src.db.models.templates_.with_id import WithIDBase - - -class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): - __tablename__ = "url_auto_agency_suggestions" - - agency_id = get_agency_id_foreign_column(nullable=True) - is_unknown = Column(Boolean, nullable=True) - confidence = Column(Float, nullable=False) - - - agency = relationship("Agency", back_populates="automated_suggestions") - url = relationship("URL", 
class SubtaskDetailCode(Enum):
    """Detail codes recorded with an agency-ID subtask outcome.

    Stored alongside subtask rows; URLAutoAgencyIDSubtaskPydantic.detail
    defaults to NO_DETAILS.
    """
    # Default: nothing further to report about the outcome.
    NO_DETAILS = "no details"
    # presumably set when the subtask's input data could not be fetched — TODO confirm
    RETRIEVAL_ERROR = "retrieval error"
    # Homepage-match outcomes, distinguished by agency / data-source cardinality.
    HOMEPAGE_SINGLE_AGENCY = "homepage-single agency"
    HOMEPAGE_NO_DATA_SOURCES = "homepage-no data sources"
    HOMEPAGE_MULTI_AGENCY_NONZERO_DATA_SOURCES = "homepage-multi agency nonzero data sources"
-from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel class URLAutoAgencyIDSubtaskPydantic(BulkInsertableModel): + task_id: int url_id: int - subtask: AutoAgencyIDSubtask + subtask: AutoAgencyIDSubtaskType agencies_found: bool - auto_comment: str | None = None + detail: SubtaskDetailCode = SubtaskDetailCode.NO_DETAILS @classmethod def sa_model(cls) -> type[Base]: diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py index ab710055..ec04d471 100644 --- a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -1,6 +1,6 @@ from src.db.models.helpers import enum_column -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtask, SubtaskDetailCode -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, TaskDependentMixin from src.db.models.templates_.with_id import WithIDBase import sqlalchemy as sa @@ -8,13 +8,14 @@ class URLAutoAgencyIDSubtask( WithIDBase, URLDependentMixin, + TaskDependentMixin, CreatedAtMixin ): __tablename__ = "url_auto_agency_id_subtasks" subtask = enum_column( - AutoAgencyIDSubtask, + AutoAgencyIDSubtaskType, name="agency_auto_suggestion_method" ) agencies_found = sa.Column( diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/__init__.py b/src/db/models/impl/url/suggestion/agency/suggestion/__init__.py new 
file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/link/pydantic.py b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py similarity index 53% rename from src/db/models/impl/url/suggestion/agency/link/pydantic.py rename to src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py index 8685195f..e709957a 100644 --- a/src/db/models/impl/url/suggestion/agency/link/pydantic.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py @@ -1,10 +1,8 @@ -from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel -class LinkAgencyIDSubtaskAgenciesPydantic( +class AgencyIDSubtaskSuggestionPydantic( BulkInsertableModel, - BulkDeletableModel, ): subtask_id: int agency_id: int diff --git a/src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py similarity index 84% rename from src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py rename to src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index 2b36e53a..0bc956fd 100644 --- a/src/db/models/impl/url/suggestion/agency/link/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -3,12 +3,12 @@ import sqlalchemy as sa -class LinkAgencyIDSubtaskAgencies( +class AgencyIDSubtaskSuggestion( Base, CreatedAtMixin, AgencyDependentMixin, ): - __tablename__ = "link_agency_id_subtask_agencies" + __tablename__ = "agency_id_subtask_suggestions" subtask_id = sa.Column( sa.Integer, diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 541e5d09..d0dbbcab 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -1,5 +1,8 @@ -from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP +from typing import ClassVar +from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event + +from src.db.models.exceptions import WriteToViewError from 
class ViewMixin:
    """Attach to any mapped class that represents a DB view.

    Registers mapper-level event hooks that reject every write attempt
    (insert/update/delete) against the mapped class, since a view is
    read-only.
    """

    __is_view__: ClassVar[bool] = True

    @classmethod
    def __declare_last__(cls) -> None:
        """Declarative hook: wire the write-blocking listener onto this class."""
        write_events = ("before_insert", "before_update", "before_delete")
        for event_name in write_events:
            event.listen(cls, event_name, cls._block_write)

    @staticmethod
    def _block_write(mapper, connection, target):
        # Unconditionally refuse the flush operation on a view-mapped object.
        raise WriteToViewError(f"{type(target).__name__} is a read-only view.")
class URLAnnotationFlagsView(
    Base,
    ViewMixin,
    URLDependentMixin
):
    """Read-only ORM mapping of the `url_annotation_flags` database view.

    One row per URL (primary key `url_id` via URLDependentMixin); each flag
    reports whether a matching suggestion/review/link row exists for that
    URL (see the view SQL in this module's docstring). Writes are blocked
    by ViewMixin.
    """
    __tablename__ = "url_annotation_flags"
    __table_args__ = (
        PrimaryKeyConstraint("url_id"),
        # NOTE(review): presumably consumed by tooling that inspects
        # Table.info to skip views — confirm.
        {"info": "view"}
    )

    # Auto-suggestion presence flags (one per suggestion kind).
    has_auto_record_type_suggestion = Column(Boolean, nullable=False)
    has_auto_relevant_suggestion = Column(Boolean, nullable=False)
    has_auto_agency_suggestion = Column(Boolean, nullable=False)
    # User-suggestion presence flags.
    has_user_record_type_suggestion = Column(Boolean, nullable=False)
    has_user_relevant_suggestion = Column(Boolean, nullable=False)
    has_user_agency_suggestion = Column(Boolean, nullable=False)
    # Confirmed-agency link and review status.
    has_confirmed_agency = Column(Boolean, nullable=False)
    was_reviewed = Column(Boolean, nullable=False)
AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) - - # Exclude if automated suggestions exist - statement = statement.where( - ~exists().where(AutomatedSuggestion.url_id == URL.id) - ) - # Exclude if confirmed agencies exist - statement = statement.where( - ~exists().where(LinkURLAgency.url_id == URL.id) - ) - return statement + raise NotImplementedError + # # Aliases for clarity + # AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) + # + # # Exclude if automated suggestions exist + # statement = statement.where( + # ~exists().where(AutomatedSuggestion.url_id == URL.id) + # ) + # # Exclude if confirmed agencies exist + # statement = statement.where( + # ~exists().where(LinkURLAgency.url_id == URL.id) + # ) + # return statement @staticmethod def pending_urls_missing_miscellaneous_metadata_query() -> Select: diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 66dd2e92..a6abb785 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -4,7 +4,11 @@ from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ + NLPLocationMatchResponse from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @@ -21,6 +25,12 @@ def __init__( ): self.access_manager = access_manager + async 
def search_agency_by_location( + self, + params: list[SearchAgencyByLocationParams] + ) -> list[SearchAgencyByLocationResponse]: + raise NotImplementedError + async def match_agency( self, name: str, diff --git a/src/external/pdap/dtos/search_agency_by_location/__init__.py b/src/external/pdap/dtos/search_agency_by_location/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/dtos/search_agency_by_location/params.py b/src/external/pdap/dtos/search_agency_by_location/params.py new file mode 100644 index 00000000..855c9a76 --- /dev/null +++ b/src/external/pdap/dtos/search_agency_by_location/params.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class SearchAgencyByLocationParams(BaseModel): + request_id: int + state_iso: str | None + locations: list[str] \ No newline at end of file diff --git a/src/external/pdap/dtos/search_agency_by_location/response.py b/src/external/pdap/dtos/search_agency_by_location/response.py new file mode 100644 index 00000000..7f786c89 --- /dev/null +++ b/src/external/pdap/dtos/search_agency_by_location/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + + +class SearchAgencyByLocationResult(BaseModel): + agency_id: int + similarity: float = Field(ge=0, le=1) + +class SearchAgencyByLocationResponse(BaseModel): + request_id: int + results: list[SearchAgencyByLocationResult] \ No newline at end of file diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 5b56fca3..9df2be52 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -103,6 +103,18 @@ def updated_at_column() -> sa.Column: comment='The last time the row was updated.' ) +def task_id_column() -> sa.Column: + return sa.Column( + 'task_id', + sa.Integer(), + sa.ForeignKey( + 'tasks.id', + ondelete='CASCADE' + ), + nullable=False, + comment='A foreign key to the `tasks` table.' 
+ ) + def url_id_column(name: str = 'url_id') -> sa.Column: return sa.Column( name, diff --git a/tests/automated/integration/db/structure/test_view.py b/tests/automated/integration/db/structure/test_view.py new file mode 100644 index 00000000..08a5d57c --- /dev/null +++ b/tests/automated/integration/db/structure/test_view.py @@ -0,0 +1,70 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import BatchStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.enums import TaskType +from src.db.models.exceptions import WriteToViewError +from src.db.models.impl.task.core import Task +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.views.has_agency_auto_suggestion import HasAgencyAutoSuggestionView + +@pytest.mark.asyncio +async def test_has_agency_auto_suggestion_view( + adb_client_test: AsyncDatabaseClient +) -> None: + """Test functionality of agency auto suggestion view and view logic in general.""" + + view_objects: list[HasAgencyAutoSuggestionView] = \ + await adb_client_test.get_all(HasAgencyAutoSuggestionView) + + assert len(view_objects) == 0 + + url = URL( + url="https://example.com/1", + status=URLStatus.OK, + source=URLSource.COLLECTOR + ) + url_id: int = await adb_client_test.add(url, return_id=True) + + view_objects: list[HasAgencyAutoSuggestionView] = \ + await adb_client_test.get_all(HasAgencyAutoSuggestionView) + + assert len(view_objects) == 1 + assert view_objects[0].url_id == url_id + assert view_objects[0].has_agency_suggestions is False + + + task = Task( + task_type=TaskType.HTML.value, + task_status=BatchStatus.READY_TO_LABEL, + ) + task_id: int = await adb_client_test.add(task, return_id=True) + + subtask = 
URLAutoAgencyIDSubtask( + task_id=task_id, + url_id=url_id, + subtask=AutoAgencyIDSubtaskType.CKAN, + agencies_found=False, + detail=SubtaskDetailCode.RETRIEVAL_ERROR + ) + await adb_client_test.add(subtask) + + view_objects: list[HasAgencyAutoSuggestionView] = \ + await adb_client_test.get_all(HasAgencyAutoSuggestionView) + + assert len(view_objects) == 1 + assert view_objects[0].url_id == url_id + assert view_objects[0].has_agency_suggestions is True + + + view_obj_to_add = HasAgencyAutoSuggestionView( + url_id=1, + has_agency_suggestions=True + ) + + with pytest.raises(WriteToViewError): + await adb_client_test.add(view_obj_to_add) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py index c7818e77..50748b7a 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py @@ -1,6 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion async def assert_expected_confirmed_and_auto_suggestions(adb_client: AsyncDatabaseClient): @@ -11,7 +10,8 @@ async def assert_expected_confirmed_and_auto_suggestions(adb_client: AsyncDataba assert len(confirmed_suggestions) == 3, f"Expected 3 confirmed suggestions, got {len(confirmed_suggestions)}" agencies = await adb_client.get_all(Agency) assert len(agencies) == 2 - auto_suggestions = await adb_client.get_all(AutomatedUrlAgencySuggestion) + raise NotImplementedError("Revise") + # auto_suggestions = await adb_client.get_all(AutomatedUrlAgencySuggestion) assert len(auto_suggestions) == 4, f"Expected 4 auto suggestions, got {len(auto_suggestions)}" # Of the auto suggestions, 2 should be unknown 
assert len([s for s in auto_suggestions if s.is_unknown]) == 2 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py index ff9898fe..a48cfc0c 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py @@ -3,15 +3,13 @@ import pytest from aiohttp import ClientSession -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ - MuckrockAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import \ + MuckrockAgencyIDSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask -from tests.automated.integration.tasks.url.impl.agency_identification.happy_path.asserts import \ - assert_expected_confirmed_and_auto_suggestions from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @@ -108,10 +106,10 @@ async def test_agency_identification_task( subtask_class_collector_type = [ - (MuckrockAgencyIdentificationSubtask, 
CollectorType.MUCKROCK_ALL_SEARCH), - (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_COUNTY_SEARCH), - (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_SIMPLE_SEARCH), - (CKANAgencyIdentificationSubtask, CollectorType.CKAN), + (MuckrockAgencyIDSubtaskOperator, CollectorType.MUCKROCK_ALL_SEARCH), + (MuckrockAgencyIDSubtaskOperator, CollectorType.MUCKROCK_COUNTY_SEARCH), + (MuckrockAgencyIDSubtaskOperator, CollectorType.MUCKROCK_SIMPLE_SEARCH), + (CKANAgencyIDSubtaskOperator, CollectorType.CKAN), (UnknownAgencyIdentificationSubtask, CollectorType.COMMON_CRAWLER), (UnknownAgencyIdentificationSubtask, CollectorType.AUTO_GOOGLER), (UnknownAgencyIdentificationSubtask, None) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py index 6a2e4fed..832ca7df 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py @@ -3,7 +3,7 @@ import pytest from src.external.pdap.enums import MatchAgencyResponseStatus -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator from src.core.enums import SuggestionType from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo @@ -33,14 +33,14 @@ async def test_ckan_subtask(db_data_creator: DBDataCreator): ) # Assuming MatchAgencyResponse is a class # Create an instance of CKANAgencyIdentificationSubtask - task = CKANAgencyIdentificationSubtask(pdap_client) + task = CKANAgencyIDSubtaskOperator(pdap_client) # Call the run method with static values collector_metadata = {"agency_name": "Test Agency"} 
url_id = 1 # Call the run method - result = await task.run(url_id, collector_metadata) + result = await task.inner_logic(url_id, collector_metadata) # Check the result assert len(result) == 2 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py index 80f92ec4..f08db57c 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py @@ -7,7 +7,7 @@ from src.collectors.impl.muckrock.enums import AgencyLookupResponseType from src.core.enums import SuggestionType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import MuckrockAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator from src.external.pdap.client import PDAPClient from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @@ -47,13 +47,13 @@ async def test_muckrock_subtask(db_data_creator: DBDataCreator): ) # Create an instance of MuckrockAgencyIdentificationSubtask with mock dependencies - muckrock_agency_identification_subtask = MuckrockAgencyIdentificationSubtask( + muckrock_agency_identification_subtask = MuckrockAgencyIDSubtaskOperator( muckrock_api_interface=muckrock_api_interface_mock, pdap_client=pdap_client_mock ) # Run the subtask - results: list[URLAgencySuggestionInfo] = await muckrock_agency_identification_subtask.run( + results: list[URLAgencySuggestionInfo] = await muckrock_agency_identification_subtask.inner_logic( url_id=1, collector_metadata={ "agency": 123 diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py index aab59dca..a2a32404 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py @@ -10,7 +10,7 @@ async def test_unknown_agency_identification_subtask(): # Test that no_collector subtask correctly adds URL to # url_agency_suggestions with label 'Unknown' subtask = UnknownAgencyIdentificationSubtask() - results: list[URLAgencySuggestionInfo] = await subtask.run(url_id=1, collector_metadata={}) + results: list[URLAgencySuggestionInfo] = await subtask.inner_logic(url_id=1, collector_metadata={}) assert len(results) == 1 assert results[0].url_id == 1 assert results[0].suggestion_type == SuggestionType.UNKNOWN \ No newline at end of file diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index 630d0f71..e81c266d 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -8,5 +8,7 @@ def wipe_database(connection_string: str) -> None: engine = create_engine(connection_string) with engine.connect() as connection: for table in reversed(Base.metadata.sorted_tables): + if table.info == "view": + continue connection.execute(table.delete()) connection.commit() diff --git a/uv.lock b/uv.lock index 067bc37f..08a5ddf8 100644 --- a/uv.lock +++ b/uv.lock @@ -214,6 +214,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload_time = "2025-04-15T17:05:12.221Z" }, ] +[[package]] +name = "blis" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] 
+sdist = { url = "https://files.pythonhosted.org/packages/61/aa/0743c994884de83472c854bb534c9edab8d711e1880d4fa194e6d876bb60/blis-1.2.1.tar.gz", hash = "sha256:1066beedbedc2143c22bd28742658de05694afebacde8d8c2d14dd4b5a96765a", size = 2510297, upload_time = "2025-04-01T12:01:56.849Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/57/ae6596b1e27859886e0b81fb99497bcfff139895585a9e2284681c8a8846/blis-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:778c4f72b71f97187e3304acfbd30eab98c9ba1a5b03b65128bc3875400ae604", size = 6976808, upload_time = "2025-04-01T12:01:21.175Z" }, + { url = "https://files.pythonhosted.org/packages/ce/35/6225e6ad2bccf23ac124448d59112c098d63a8917462e9f73967bc217168/blis-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5c5f2ffb0ae9c1f5aaa95b9681bcdd9a777d007c501fa220796329b939ca2790", size = 1281913, upload_time = "2025-04-01T12:01:23.202Z" }, + { url = "https://files.pythonhosted.org/packages/7a/84/c6a6d1c0a8a00799d2ec5db05d676bd9a9b0472cac4d3eff2e2fd1953521/blis-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db4dc5d2d57106bb411633603a5c7d178a0845267c3efc7e5ea4fa7a44772976", size = 3104139, upload_time = "2025-04-01T12:01:24.781Z" }, + { url = "https://files.pythonhosted.org/packages/a5/6c/c5fab7ed1fe6e8bdcda732017400d1adc53db5b6dd2c2a6046acab91f4fa/blis-1.2.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c621271c2843101927407e052b35a67f853da59d5c74e9e070e982c7f82e2e04", size = 3304143, upload_time = "2025-04-01T12:01:27.363Z" }, + { url = "https://files.pythonhosted.org/packages/22/d1/85f03269886253758546fcfdbeddee7e717d843ea134596b60db9c2648c4/blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43f65f882250b817566d7543abd1f6da297f1662e5dd9936e14c04b88285a497", size = 11660080, upload_time = "2025-04-01T12:01:29.478Z" }, + { url = 
"https://files.pythonhosted.org/packages/78/c8/c81ed3036e8ce0d6ce0d19a032c7f3d69247f221c5357e18548dea9380d3/blis-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78a0613d559ccc426c101c67e8f84e1f93491e29d722c370872c538ee652bd07", size = 3133133, upload_time = "2025-04-01T12:01:31.537Z" }, + { url = "https://files.pythonhosted.org/packages/b8/42/7c296e04b979204777ecae2fe9287ac7b0255d8c4c2111d2a735c439b9d7/blis-1.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f5e32e5e5635fc7087b724b53120dbcd86201f56c0405882ce254bc0e493392", size = 4360695, upload_time = "2025-04-01T12:01:33.449Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ab/aa5c8dfd0068d2cc976830797dd092779259860f964286db05739154e3a7/blis-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d339c97cc83f53e39c1013d0dcd7d5278c853dc102d931132eeb05b226e28429", size = 14828081, upload_time = "2025-04-01T12:01:35.129Z" }, + { url = "https://files.pythonhosted.org/packages/7c/c0/047fef3ac4a531903c52ba7c108fd608556627723bfef7554f040b10e556/blis-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:8d284323cc994e9b818c32046f1aa3e57bcc41c74e02daebdf0d3bc3e14355cb", size = 6232639, upload_time = "2025-04-01T12:01:37.268Z" }, + { url = "https://files.pythonhosted.org/packages/2f/f1/2aecd2447de0eb5deea3a13e471ab43e42e8561afe56a13d830f95c58909/blis-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1cd35e94a1a97b37b31b11f097f998a3a0e75ac06d57e6edf7d9597200f55756", size = 6989811, upload_time = "2025-04-01T12:01:39.013Z" }, + { url = "https://files.pythonhosted.org/packages/cf/39/4c097508f6b9ef7df27dd5ada0a175e8169f58cbe33d40a303a844abdaea/blis-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7b6394d27f2259c580df8d13ebe9c0a188a6ace0a689e93d6e49cb15018d4d9c", size = 1282669, upload_time = "2025-04-01T12:01:41.418Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/8e/b8a5eafa9824fcc7f3339a283e910f7af110d749fd09f52e83f432124543/blis-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9c127159415dc772f345abc3575e1e2d02bb1ae7cb7f532267d67705be04c66", size = 3063750, upload_time = "2025-04-01T12:01:43.277Z" }, + { url = "https://files.pythonhosted.org/packages/f7/7a/f88e935f2cd3ad52ef363beeddf9a537d5038e519aa7b09dc18c762fbb66/blis-1.2.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5f9fa589aa72448009fd5001afb05e69f3bc953fe778b44580fd7d79ee8201a1", size = 3260903, upload_time = "2025-04-01T12:01:44.815Z" }, + { url = "https://files.pythonhosted.org/packages/4a/26/283f1392974e5c597228f8485f45f89de33f2c85becebc25e846d0485e44/blis-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1aa6150259caf4fa0b527bfc8c1e858542f9ca88a386aa90b93e1ca4c2add6df", size = 11616588, upload_time = "2025-04-01T12:01:46.356Z" }, + { url = "https://files.pythonhosted.org/packages/fa/86/57047b688e42c92e35d0581ef9db15ee3bdf14deff4d9a2481ce331f2dae/blis-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3ba67c09883cae52da3d9e9d3f4305464efedd336032c4d5c6c429b27b16f4c1", size = 3072892, upload_time = "2025-04-01T12:01:48.314Z" }, + { url = "https://files.pythonhosted.org/packages/c7/db/85b6f5fa2a2515470cc5a2cbeaedd25aa465fa572801f18d14c24c9e5102/blis-1.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7d9c5fca21b01c4b2f3cb95b71ce7ef95e58b3b62f0d79d1f699178c72c1e03e", size = 4310005, upload_time = "2025-04-01T12:01:49.815Z" }, + { url = "https://files.pythonhosted.org/packages/e2/ae/6e610e950476ebc9868a0207a827d67433ef65e2b14b837d317e60248e5a/blis-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6952a4a1f15e0d1f73cc1206bd71368b32551f2e94852dae288b50c4ea0daf31", size = 14790198, upload_time = "2025-04-01T12:01:52.601Z" }, + { url = 
"https://files.pythonhosted.org/packages/e4/0e/353e29e8dd3d31bba25a3eabbbfb798d82bd19ca2d24fd00583b6d3992f3/blis-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:bd0360427b1669684cd35a8355be126d7a33992ccac6dcb1fbef5e100f4e3026", size = 6260640, upload_time = "2025-04-01T12:01:54.849Z" }, +] + [[package]] name = "boltons" version = "25.0.0" @@ -298,6 +327,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload_time = "2025-02-20T21:01:16.647Z" }, ] +[[package]] +name = "catalogue" +version = "2.0.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/b4/244d58127e1cdf04cf2dc7d9566f0d24ef01d5ce21811bab088ecc62b5ea/catalogue-2.0.10.tar.gz", hash = "sha256:4f56daa940913d3f09d589c191c74e5a6d51762b3a9e37dd53b7437afd6cda15", size = 19561, upload_time = "2023-09-25T06:29:24.962Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/96/d32b941a501ab566a16358d68b6eb4e4acc373fab3c3c4d7d9e649f7b4bb/catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f", size = 17325, upload_time = "2023-09-25T06:29:23.337Z" }, +] + [[package]] name = "certifi" version = "2025.4.26" @@ -384,6 +422,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/58/1f37bf81e3c689cc74ffa42102fa8915b59085f54a6e4a80bc6265c0f6bf/click-8.2.0-py3-none-any.whl", hash = "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c", size = 102156, upload_time = "2025-05-10T22:21:01.352Z" }, ] +[[package]] +name = "cloudpathlib" +version = "0.22.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/05/bc/d7345595a4467144b9e0b32e5eda9e4633ea6e4982262b0696935adb2229/cloudpathlib-0.22.0.tar.gz", hash = "sha256:6c0cb0ceab4f66a3a05a84055f9318fb8316cae5e096819f3f8e4be64feab6e9", size = 52304, upload_time = "2025-08-30T05:20:04.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/72/e8e53d8232e801e040f4b557ff3a453cecbb630d53ae107bd5e66a206bb9/cloudpathlib-0.22.0-py3-none-any.whl", hash = "sha256:2fdfaf5c4f85810ae8374d336d04dee371914d0e41a984695ae67308d7a5a009", size = 61520, upload_time = "2025-08-30T05:20:03.232Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -393,6 +440,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload_time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "confection" +version = "0.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "srsly" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/d3/57c6631159a1b48d273b40865c315cf51f89df7a9d1101094ef12e3a37c2/confection-0.1.5.tar.gz", hash = "sha256:8e72dd3ca6bd4f48913cd220f10b8275978e740411654b6e8ca6d7008c590f0e", size = 38924, upload_time = "2024-05-31T16:17:01.559Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/00/3106b1854b45bd0474ced037dfe6b73b90fe68a68968cef47c23de3d43d2/confection-0.1.5-py3-none-any.whl", hash = "sha256:e29d3c3f8eac06b3f77eb9dfb4bf2fc6bcc9622a98ca00a698e3d019c6430b14", size = 35451, upload_time = "2024-05-31T16:16:59.075Z" }, +] + +[[package]] +name = "cymem" +version = "2.0.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/4a/1acd761fb6ac4c560e823ce40536a62f886f2d59b2763b5c3fc7e9d92101/cymem-2.0.11.tar.gz", hash = 
"sha256:efe49a349d4a518be6b6c6b255d4a80f740a341544bde1a807707c058b88d0bd", size = 10346, upload_time = "2025-01-16T21:50:41.045Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/e3/d98e3976f4ffa99cddebc1ce379d4d62e3eb1da22285267f902c99cc3395/cymem-2.0.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3ee54039aad3ef65de82d66c40516bf54586287b46d32c91ea0530c34e8a2745", size = 42005, upload_time = "2025-01-16T21:49:34.977Z" }, + { url = "https://files.pythonhosted.org/packages/41/b4/7546faf2ab63e59befc95972316d62276cec153f7d4d60e7b0d5e08f0602/cymem-2.0.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c05ef75b5db217be820604e43a47ccbbafea98ab6659d07cea92fa3c864ea58", size = 41747, upload_time = "2025-01-16T21:49:36.108Z" }, + { url = "https://files.pythonhosted.org/packages/7d/4e/042f372e5b3eb7f5f3dd7677161771d301de2b6fa3f7c74e1cebcd502552/cymem-2.0.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d5381e5793ce531bac0dbc00829c8381f18605bb67e4b61d34f8850463da40", size = 217647, upload_time = "2025-01-16T21:49:37.433Z" }, + { url = "https://files.pythonhosted.org/packages/48/cb/2207679e4b92701f78cf141e1ab4f81f55247dbe154eb426b842a0a993de/cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b9d3f42d7249ac81802135cad51d707def058001a32f73fc7fbf3de7045ac7", size = 218857, upload_time = "2025-01-16T21:49:40.09Z" }, + { url = "https://files.pythonhosted.org/packages/31/7a/76ae3b7a39ab2531029d281e43fcfcaad728c2341b150a81a3a1f5587cf3/cymem-2.0.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:39b78f2195d20b75c2d465732f6b8e8721c5d4eb012777c2cb89bdb45a043185", size = 206148, upload_time = "2025-01-16T21:49:41.383Z" }, + { url = "https://files.pythonhosted.org/packages/25/f9/d0fc0191ac79f15638ddb59237aa76f234691374d7d7950e10f384bd8a25/cymem-2.0.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2203bd6525a80d8fd0c94654a263af21c0387ae1d5062cceaebb652bf9bad7bc", size = 
207112, upload_time = "2025-01-16T21:49:43.986Z" }, + { url = "https://files.pythonhosted.org/packages/56/c8/75f75889401b20f4c3a7c5965dda09df42913e904ddc2ffe7ef3bdf25061/cymem-2.0.11-cp311-cp311-win_amd64.whl", hash = "sha256:aa54af7314de400634448da1f935b61323da80a49484074688d344fb2036681b", size = 39360, upload_time = "2025-01-16T21:49:45.479Z" }, + { url = "https://files.pythonhosted.org/packages/71/67/0d74f7e9d79f934368a78fb1d1466b94bebdbff14f8ae94dd3e4ea8738bb/cymem-2.0.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a0fbe19ce653cd688842d81e5819dc63f911a26e192ef30b0b89f0ab2b192ff2", size = 42621, upload_time = "2025-01-16T21:49:46.585Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d6/f7a19c63b48efc3f00a3ee8d69070ac90202e1e378f6cf81b8671f0cf762/cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de72101dc0e6326f6a2f73e05a438d1f3c6110d41044236d0fbe62925091267d", size = 42249, upload_time = "2025-01-16T21:49:48.973Z" }, + { url = "https://files.pythonhosted.org/packages/d7/60/cdc434239813eef547fb99b6d0bafe31178501702df9b77c4108c9a216f6/cymem-2.0.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee4395917f6588b8ac1699499128842768b391fe8896e8626950b4da5f9a406", size = 224758, upload_time = "2025-01-16T21:49:51.382Z" }, + { url = "https://files.pythonhosted.org/packages/1d/68/8fa6efae17cd3b2ba9a2f83b824867c5b65b06f7aec3f8a0d0cabdeffb9b/cymem-2.0.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02f2b17d760dc3fe5812737b1ce4f684641cdd751d67761d333a3b5ea97b83", size = 227995, upload_time = "2025-01-16T21:49:54.538Z" }, + { url = "https://files.pythonhosted.org/packages/e4/f3/ceda70bf6447880140602285b7c6fa171cb7c78b623d35345cc32505cd06/cymem-2.0.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:04ee6b4041ddec24512d6e969ed6445e57917f01e73b9dabbe17b7e6b27fef05", size = 215325, upload_time = "2025-01-16T21:49:57.229Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/47/6915eaa521e1ce7a0ba480eecb6870cb4f681bcd64ced88c2f0ed7a744b4/cymem-2.0.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e1048dae7e627ee25f22c87bb670b13e06bc0aecc114b89b959a798d487d1bf4", size = 216447, upload_time = "2025-01-16T21:50:00.432Z" }, + { url = "https://files.pythonhosted.org/packages/7b/be/8e02bdd31e557f642741a06c8e886782ef78f0b00daffd681922dc9bbc88/cymem-2.0.11-cp312-cp312-win_amd64.whl", hash = "sha256:0c269c7a867d74adeb9db65fa1d226342aacf44d64b7931282f0b0eb22eb6275", size = 39283, upload_time = "2025-01-16T21:50:03.384Z" }, + { url = "https://files.pythonhosted.org/packages/bd/90/b064e2677e27a35cf3605146abc3285d4f599cc1b6c18fc445ae876dd1e3/cymem-2.0.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4a311c82f743275c84f708df89ac5bf60ddefe4713d532000c887931e22941f", size = 42389, upload_time = "2025-01-16T21:50:05.925Z" }, + { url = "https://files.pythonhosted.org/packages/fd/60/7aa0561a6c1f0d42643b02c4fdeb2a16181b0ff4e85d73d2d80c6689e92a/cymem-2.0.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:02ed92bead896cca36abad00502b14fa651bdf5d8319461126a2d5ac8c9674c5", size = 41948, upload_time = "2025-01-16T21:50:08.375Z" }, + { url = "https://files.pythonhosted.org/packages/5f/4e/88a29cc5575374982e527b4ebcab3781bdc826ce693c6418a0f836544246/cymem-2.0.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44ddd3588379f8f376116384af99e3fb5f90091d90f520c341942618bf22f05e", size = 219382, upload_time = "2025-01-16T21:50:13.089Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/8f96e167e93b7f7ec105ed7b25c77bbf215d15bcbf4a24082cdc12234cd6/cymem-2.0.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87ec985623624bbd298762d8163fc194a096cb13282731a017e09ff8a60bb8b1", size = 222974, upload_time = "2025-01-16T21:50:17.969Z" }, + { url = 
"https://files.pythonhosted.org/packages/6a/fc/ce016bb0c66a4776345fac7508fddec3b739b9dd4363094ac89cce048832/cymem-2.0.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3385a47285435848e0ed66cfd29b35f3ed8703218e2b17bd7a0c053822f26bf", size = 213426, upload_time = "2025-01-16T21:50:19.349Z" }, + { url = "https://files.pythonhosted.org/packages/5c/c8/accf7cc768f751447a5050b14a195af46798bc22767ac25f49b02861b1eb/cymem-2.0.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5461e65340d6572eb64deadce79242a446a1d39cb7bf70fe7b7e007eb0d799b0", size = 219195, upload_time = "2025-01-16T21:50:21.407Z" }, + { url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload_time = "2025-01-16T21:50:24.239Z" }, +] + [[package]] name = "data-source-identification" version = "0.1.0" @@ -427,6 +516,7 @@ dependencies = [ { name = "python-dotenv" }, { name = "requests" }, { name = "side-effects" }, + { name = "spacy" }, { name = "sqlalchemy" }, { name = "starlette" }, { name = "tqdm" }, @@ -476,6 +566,7 @@ requires-dist = [ { name = "python-dotenv", specifier = "~=1.0.1" }, { name = "requests", specifier = "~=2.32.3" }, { name = "side-effects", specifier = ">=1.6.dev0" }, + { name = "spacy", specifier = ">=3.8.7" }, { name = "sqlalchemy", specifier = "~=2.0.36" }, { name = "starlette", specifier = "~=0.45.3" }, { name = "tqdm", specifier = ">=4.64.1" }, @@ -1069,6 +1160,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/47/3729f00f35a696e68da15d64eb9283c330e776f3b5789bac7f2c0c4df209/jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf", size = 206867, upload_time = "2025-03-10T21:36:25.843Z" }, ] +[[package]] +name = "langcodes" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "language-data" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/7a/5a97e327063409a5caa21541e6d08ae4a0f2da328447e9f2c7b39e179226/langcodes-3.5.0.tar.gz", hash = "sha256:1eef8168d07e51e131a2497ffecad4b663f6208e7c3ae3b8dc15c51734a6f801", size = 191030, upload_time = "2024-11-19T10:23:45.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/6b/068c2ea7a712bf805c62445bd9e9c06d7340358ef2824150eceac027444b/langcodes-3.5.0-py3-none-any.whl", hash = "sha256:853c69d1a35e0e13da2f427bb68fb2fa4a8f4fb899e0c62ad8df8d073dcfed33", size = 182974, upload_time = "2024-11-19T10:23:42.824Z" }, +] + +[[package]] +name = "language-data" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "marisa-trie" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/ce/3f144716a9f2cbf42aa86ebc8b085a184be25c80aa453eea17c294d239c1/language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec", size = 5129310, upload_time = "2024-11-19T10:21:37.912Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/e9/5a5ffd9b286db82be70d677d0a91e4d58f7912bb8dd026ddeeb4abe70679/language_data-1.3.0-py3-none-any.whl", hash = "sha256:e2ee943551b5ae5f89cd0e801d1fc3835bb0ef5b7e9c3a4e8e17b2b214548fbf", size = 5385760, upload_time = "2024-11-19T10:21:36.005Z" }, +] + [[package]] name = "lxml" version = "5.1.1" @@ -1107,6 +1222,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload_time = "2025-04-10T12:50:53.297Z" }, ] +[[package]] +name = "marisa-trie" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/c5/e3/c9066e74076b90f9701ccd23d6a0b8c1d583feefdec576dc3e1bb093c50d/marisa_trie-1.3.1.tar.gz", hash = "sha256:97107fd12f30e4f8fea97790343a2d2d9a79d93697fe14e1b6f6363c984ff85b", size = 212454, upload_time = "2025-08-26T15:13:18.401Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/bf/2f1fe6c9fcd2b509c6dfaaf26e35128947d6d3718d0b39510903c55b7bed/marisa_trie-1.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ef045f694ef66079b4e00c4c9063a00183d6af7d1ff643de6ea5c3b0d9af01b", size = 174027, upload_time = "2025-08-26T15:12:01.434Z" }, + { url = "https://files.pythonhosted.org/packages/a9/5a/de7936d58ed0de847180cee2b95143d420223c5ade0c093d55113f628237/marisa_trie-1.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cbd28f95d5f30d9a7af6130869568e75bfd7ef2e0adfb1480f1f44480f5d3603", size = 158478, upload_time = "2025-08-26T15:12:02.429Z" }, + { url = "https://files.pythonhosted.org/packages/48/cc/80611aadefcd0bcf8cd1795cb4643bb27213319a221ba04fe071da0b75cd/marisa_trie-1.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b173ec46d521308f7c97d96d6e05cf2088e0548f82544ec9a8656af65593304d", size = 1257535, upload_time = "2025-08-26T15:12:04.271Z" }, + { url = "https://files.pythonhosted.org/packages/36/89/c4eeefb956318047036e6bdc572b6112b2059d595e85961267a90aa40458/marisa_trie-1.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:954fef9185f8a79441b4e433695116636bf66402945cfee404f8983bafa59788", size = 1275566, upload_time = "2025-08-26T15:12:05.874Z" }, + { url = "https://files.pythonhosted.org/packages/c4/63/d775a2fdfc4b555120381cd2aa6dff1845576bc14fb13796ae1b1e8dbaf7/marisa_trie-1.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ca644534f15f85bba14c412afc17de07531e79a766ce85b8dbf3f8b6e7758f20", size = 2199831, upload_time = "2025-08-26T15:12:07.175Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/aa/e5053927dc3cac77acc9b27f6f87e75c880f5d3d5eac9111fe13b1d8bf6f/marisa_trie-1.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3834304fdeaa1c9b73596ad5a6c01a44fc19c13c115194704b85f7fbdf0a7b8e", size = 2283830, upload_time = "2025-08-26T15:12:08.319Z" }, + { url = "https://files.pythonhosted.org/packages/71/3e/e314906d0de5b1a44780a23c79bb62a9aafd876e2a4e80fb34f58c721da4/marisa_trie-1.3.1-cp311-cp311-win32.whl", hash = "sha256:70b4c96f9119cfeb4dc6a0cf4afc9f92f0b002cde225bcd910915d976c78e66a", size = 117335, upload_time = "2025-08-26T15:12:09.776Z" }, + { url = "https://files.pythonhosted.org/packages/b0/2b/85623566621135de3d57497811f94679b4fb2a8f16148ef67133c2abab7a/marisa_trie-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:986eaf35a7f63c878280609ecd37edf8a074f7601c199acfec81d03f1ee9a39a", size = 143985, upload_time = "2025-08-26T15:12:10.988Z" }, + { url = "https://files.pythonhosted.org/packages/3f/40/ee7ea61b88d62d2189b5c4a27bc0fc8d9c32f8b8dc6daf1c93a7b7ad34ac/marisa_trie-1.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5b7c1e7fa6c3b855e8cfbabf38454d7decbaba1c567d0cd58880d033c6b363bd", size = 173454, upload_time = "2025-08-26T15:12:12.13Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fc/58635811586898041004b2197a085253706ede211324a53ec01612a50e20/marisa_trie-1.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c12b44c190deb0d67655021da1f2d0a7d61a257bf844101cf982e68ed344f28d", size = 155305, upload_time = "2025-08-26T15:12:13.374Z" }, + { url = "https://files.pythonhosted.org/packages/fe/98/88ca0c98d37034a3237acaf461d210cbcfeb6687929e5ba0e354971fa3ed/marisa_trie-1.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9688c7b45f744366a4ef661e399f24636ebe440d315ab35d768676c59c613186", size = 1244834, upload_time = "2025-08-26T15:12:14.795Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/5f/93b3e3607ccd693a768eafee60829cd14ea1810b75aa48e8b20e27b332c4/marisa_trie-1.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99a00cab4cf9643a87977c87a5c8961aa44fff8d5dd46e00250135f686e7dedf", size = 1265148, upload_time = "2025-08-26T15:12:16.229Z" }, + { url = "https://files.pythonhosted.org/packages/db/6e/051d7d25c7fb2b3df605c8bd782513ebbb33fddf3bae6cf46cf268cca89f/marisa_trie-1.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:83efc045fc58ca04c91a96c9b894d8a19ac6553677a76f96df01ff9f0405f53d", size = 2172726, upload_time = "2025-08-26T15:12:18.467Z" }, + { url = "https://files.pythonhosted.org/packages/58/da/244d9d4e414ce6c73124cba4cc293dd140bf3b04ca18dec64c2775cca951/marisa_trie-1.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0b9816ab993001a7854b02a7daec228892f35bd5ab0ac493bacbd1b80baec9f1", size = 2256104, upload_time = "2025-08-26T15:12:20.168Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f1/1a36ecd7da6668685a7753522af89a19928ffc80f1cc1dbc301af216f011/marisa_trie-1.3.1-cp312-cp312-win32.whl", hash = "sha256:c785fd6dae9daa6825734b7b494cdac972f958be1f9cb3fb1f32be8598d2b936", size = 115624, upload_time = "2025-08-26T15:12:21.233Z" }, + { url = "https://files.pythonhosted.org/packages/35/b2/aabd1c9f1c102aa31d66633ed5328c447be166e0a703f9723e682478fd83/marisa_trie-1.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:9868b7a8e0f648d09ffe25ac29511e6e208cc5fb0d156c295385f9d5dc2a138e", size = 138562, upload_time = "2025-08-26T15:12:22.632Z" }, + { url = "https://files.pythonhosted.org/packages/46/a2/8331b995c1b3eee83aa745f4a6502d737ec523d5955a48f167d4177db105/marisa_trie-1.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9de573d933db4753a50af891bcb3ffbfe14e200406214c223aa5dfe2163f316d", size = 172272, upload_time = "2025-08-26T15:12:24.016Z" }, + { url = 
"https://files.pythonhosted.org/packages/97/b8/7b9681b5c0ea1bb950f907a4e3919eb7f7b7b3febafaae346f3b3f199f6f/marisa_trie-1.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f4bae4f920f2a1082eaf766c1883df7da84abdf333bafa15b8717c10416a615e", size = 154671, upload_time = "2025-08-26T15:12:25.013Z" }, + { url = "https://files.pythonhosted.org/packages/ca/16/929c1f83fdcff13f8d08500f434aaa18c21c8168d16cf81585d69085e980/marisa_trie-1.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf9f2b97fcfd5e2dbb0090d0664023872dcde990df0b545eca8d0ce95795a409", size = 1238754, upload_time = "2025-08-26T15:12:26.217Z" }, + { url = "https://files.pythonhosted.org/packages/0f/0a/b0e04d3ef91a87d4c7ea0b66c004fdfc6e65c9ed83edaebecfb482dfe0ed/marisa_trie-1.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecdb19d33b26738a32602ef432b06cc6deeca4b498ce67ba8e5e39c8a7c19745", size = 1262653, upload_time = "2025-08-26T15:12:27.422Z" }, + { url = "https://files.pythonhosted.org/packages/de/1f/0ecf610ddc9a209ee63116baabb47584d5b8ecd01610091a593d9429537e/marisa_trie-1.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7416f1a084eb889c5792c57317875aeaa86abfe0bdc6f167712cebcec1d36ee", size = 2172399, upload_time = "2025-08-26T15:12:28.926Z" }, + { url = "https://files.pythonhosted.org/packages/ac/74/6b47deff3b3920449c135b9187c80f0d656adcdc5d41463745a61b012ea1/marisa_trie-1.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee428575377e29c636f2b4b3b0488875dcea310c6c5b3412ec4ef997f7bb37cc", size = 2255138, upload_time = "2025-08-26T15:12:30.271Z" }, + { url = "https://files.pythonhosted.org/packages/bd/fa/3dbcbe93dfaa626a5b3e741e7bcf3d7389aa5777175213bd8d9a9d3c992d/marisa_trie-1.3.1-cp313-cp313-win32.whl", hash = "sha256:d0f87bdf660f01e88ab3a507955697b2e3284065afa0b94fc9e77d6ad153ed5e", size = 115391, upload_time = "2025-08-26T15:12:31.465Z" }, + { url = 
"https://files.pythonhosted.org/packages/3b/ce/ddfab303646b21aef07ff9dbc83fba92e5d493f49d3bc03d899ffd45c86f/marisa_trie-1.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:a83f5f7ae3494e0cc25211296252b1b86901c788ed82c83adda19d0c98f828d6", size = 139130, upload_time = "2025-08-26T15:12:32.4Z" }, + { url = "https://files.pythonhosted.org/packages/5a/1e/734b618048ad05c50cb1673ce2c6e836dc38ddeeeb011ed1804af07327a4/marisa_trie-1.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a850b151bd1e3a5d9afef113adc22727d696603659d575d7e84f994bd8d04bf1", size = 175131, upload_time = "2025-08-26T15:12:33.728Z" }, + { url = "https://files.pythonhosted.org/packages/d3/78/c7051147cc918cb8ff4a2920e11a9b17d9dcb4d8fc122122694b486e2bfe/marisa_trie-1.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9dc61fb8f8993589544f6df268229c6cf0a56ad4ed3e8585a9cd23c5ad79527b", size = 163094, upload_time = "2025-08-26T15:12:35.312Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b8/3b904178d7878319aacaabae5131c1f281519aaac0f8c68c8ed312912ccf/marisa_trie-1.3.1-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4bd41a6e73c0d0adafe4de449b6d35530a4ce6a836a6ee839baf117785ecfd7", size = 1279812, upload_time = "2025-08-26T15:12:36.831Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bf/e77a1284247b980560b4104bbdd5d06ed2c2ae3d56ab954f97293b6dbbcd/marisa_trie-1.3.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c8b2386d2d22c57880ed20a913ceca86363765623175671137484a7d223f07a", size = 1285690, upload_time = "2025-08-26T15:12:38.754Z" }, + { url = "https://files.pythonhosted.org/packages/48/82/f6f10db5ec72de2642499f3a6e4e8607bbd2cfb28269ea08d0d8ddac3313/marisa_trie-1.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9c56001badaf1779afae5c24b7ab85938644ab8ef3c5fd438ab5d49621b84482", size = 2197943, upload_time = "2025-08-26T15:12:40.584Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/d0/74b6c3011b1ebf4a8131430156b14c3af694082cf34c392fff766096fd4b/marisa_trie-1.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83a3748088d117a9b15d8981c947df9e4f56eb2e4b5456ae34fe1f83666c9185", size = 2280132, upload_time = "2025-08-26T15:12:42.059Z" }, + { url = "https://files.pythonhosted.org/packages/28/b2/b8b0cb738fa3ab07309ed92025c6e1b278f84c7255e976921a52b30d8d1b/marisa_trie-1.3.1-cp313-cp313t-win32.whl", hash = "sha256:137010598d8cebc53dbfb7caf59bde96c33a6af555e3e1bdbf30269b6a157e1e", size = 126446, upload_time = "2025-08-26T15:12:43.339Z" }, + { url = "https://files.pythonhosted.org/packages/b6/c6/2381648d0c946556ef51c673397cea40712d945444ceed0a0a0b51a174d2/marisa_trie-1.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:ec633e108f277f2b7f4671d933a909f39bba549910bf103e2940b87a14da2783", size = 153885, upload_time = "2025-08-26T15:12:44.309Z" }, + { url = "https://files.pythonhosted.org/packages/40/8a/590f25a281e08879791aabec7b8584c7934ff3d5f9d52859197d587246ec/marisa_trie-1.3.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:389721481c14a92fa042e4b91ae065bff13e2bc567c85a10aa9d9de80aaa8622", size = 172803, upload_time = "2025-08-26T15:12:45.342Z" }, + { url = "https://files.pythonhosted.org/packages/20/7f/fd19a4aa57ad169d08e518a6ee2438e7e77bfba7786c59f65891db69d202/marisa_trie-1.3.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e6f3b45def6ff23e254eeaa9079267004f0069d0a34eba30a620780caa4f2cb", size = 155506, upload_time = "2025-08-26T15:12:46.701Z" }, + { url = "https://files.pythonhosted.org/packages/e3/05/857832b8fe6b2ec441de1154eadc66dee067ce5fb6673c3ee0b8616108ee/marisa_trie-1.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a96ef3e461ecc85ec7d2233ddc449ff5a3fbdc520caea752bc5bc8faa975231", size = 1239979, upload_time = "2025-08-26T15:12:47.943Z" }, + { url = 
"https://files.pythonhosted.org/packages/4c/08/f9ea8b720a627d54e8e19f19a0ec1cc2011e01aa2b4f40d078e7f5e9e21f/marisa_trie-1.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5370f9ef6c008e502537cc1ff518c80ddf749367ce90179efa0e7f6275903a76", size = 1255705, upload_time = "2025-08-26T15:12:49.24Z" }, + { url = "https://files.pythonhosted.org/packages/e9/c3/42360fb38cdfde5db1783e2d7cfeb8b91eea837f89ef678f308ee026d794/marisa_trie-1.3.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0dcd42774e367ceb423c211a4fc8e7ce586acfaf0929c9c06d98002112075239", size = 2175092, upload_time = "2025-08-26T15:12:50.602Z" }, + { url = "https://files.pythonhosted.org/packages/09/ba/215b0d821fd37cdc600e834a75708aa2e117124dcf495c9a6c6dc7fdcb6b/marisa_trie-1.3.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3e2a0e1be95237981bd375a388f44b33d69ea5669a2f79fea038e45fff326595", size = 2250454, upload_time = "2025-08-26T15:12:52.435Z" }, + { url = "https://files.pythonhosted.org/packages/f5/a3/292ab31a12ec1cb356e6bc8b9cc8aaec920aa892a805757c011d77e8cd93/marisa_trie-1.3.1-cp314-cp314-win32.whl", hash = "sha256:c7a33506d0451112911c69f38d55da3e0e050f2be0ea4e5176865cf03baf26a9", size = 119101, upload_time = "2025-08-26T15:12:53.615Z" }, + { url = "https://files.pythonhosted.org/packages/95/83/0ea5de53209993cf301dd9d18d4cb22c20c84c753b4357b66660a8b9eb48/marisa_trie-1.3.1-cp314-cp314-win_amd64.whl", hash = "sha256:68678816818efcd4a1787b557af81f215b989ec88680a86c85c34c914d413690", size = 142886, upload_time = "2025-08-26T15:12:54.835Z" }, + { url = "https://files.pythonhosted.org/packages/37/00/c7e063867988067992a9d9d2aceaede0be7787ca6d77ef34f2eca9d2708e/marisa_trie-1.3.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9e467e13971c64db6aed8afe4c2a131c3f73f048bec3f788a6141216acda598d", size = 175163, upload_time = "2025-08-26T15:12:55.908Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/64/eaf49d10c8506ecd717bbbeda907e474842c298354a444b875741ef4a0d9/marisa_trie-1.3.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:076731f79f8603cb3216cb6e5bbbc56536c89f63f175ad47014219ecb01e5996", size = 163119, upload_time = "2025-08-26T15:12:58.054Z" }, + { url = "https://files.pythonhosted.org/packages/b4/26/f24dd9c98ce6fc8c8d554b556e1c43f326c5df414b79aba33bd7d2d2fbfd/marisa_trie-1.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:82de2de90488d0fbbf74cf9f20e1afd62e320693b88f5e9565fc80b28f5bbad3", size = 1277783, upload_time = "2025-08-26T15:12:59.225Z" }, + { url = "https://files.pythonhosted.org/packages/b2/1a/efd63e75d1374e08f8ebe2e15ff1b1ed5f6d5cf57614a5b0884bd9c882ee/marisa_trie-1.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c2bc6bee737f4d47fce48c5b03a7bd3214ef2d83eb5c9f84210091370a5f195", size = 1282309, upload_time = "2025-08-26T15:13:00.797Z" }, + { url = "https://files.pythonhosted.org/packages/33/4c/0cefa1eceec7858766af5939979857ac079c6c5251e00c6991c1a26bb1b7/marisa_trie-1.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:56043cf908ddf3d7364498085dbc2855d4ea8969aff3bf2439a79482a79e68e2", size = 2196594, upload_time = "2025-08-26T15:13:02.158Z" }, + { url = "https://files.pythonhosted.org/packages/bb/64/900f4132fc345be4b40073e66284707afa4cc203d8d0f1fe78c6b111cd47/marisa_trie-1.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9651daa1fdc471df5a5fa6a4833d3b01e76ac512eea141a5995681aebac5555f", size = 2277730, upload_time = "2025-08-26T15:13:03.528Z" }, + { url = "https://files.pythonhosted.org/packages/62/ab/6d6cf25a5c8835589a601a9a916ec5cdee740e277fed8ee620df546834bb/marisa_trie-1.3.1-cp314-cp314t-win32.whl", hash = "sha256:c6571462417cda2239b1ade86ceaf3852da9b52c6286046e87d404afc6da20a7", size = 131409, upload_time = "2025-08-26T15:13:05.106Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/61/c4efc044141429e67e8fd5536be86d76303f250179c7f92b2cc0c72e8d0b/marisa_trie-1.3.1-cp314-cp314t-win_amd64.whl", hash = "sha256:9e6496bbad3068e3bbbb934b1e1307bf1a9cb4609f9ec47b57e8ea37f1b5ee40", size = 162564, upload_time = "2025-08-26T15:13:06.112Z" }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1281,6 +1452,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload_time = "2024-01-28T18:52:31.981Z" }, ] +[[package]] +name = "murmurhash" +version = "1.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/e9/02efbc6dfc2dd2085da3daacf9a8c17e8356019eceaedbfa21555e32d2af/murmurhash-1.0.13.tar.gz", hash = "sha256:737246d41ee00ff74b07b0bd1f0888be304d203ce668e642c86aa64ede30f8b7", size = 13258, upload_time = "2025-05-22T12:35:57.019Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/d1/9d13a02d9c8bfff10b1f68d19df206eaf2a8011defeccf7eb05ea0b8c54e/murmurhash-1.0.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b20d168370bc3ce82920121b78ab35ae244070a9b18798f4a2e8678fa03bd7e0", size = 26410, upload_time = "2025-05-22T12:35:20.786Z" }, + { url = "https://files.pythonhosted.org/packages/14/b0/3ee762e98cf9a8c2df9c8b377c326f3dd4495066d4eace9066fca46eba7a/murmurhash-1.0.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cef667d2e83bdceea3bc20c586c491fa442662ace1aea66ff5e3a18bb38268d8", size = 26679, upload_time = "2025-05-22T12:35:21.808Z" }, + { url = "https://files.pythonhosted.org/packages/39/06/24618f79cd5aac48490932e50263bddfd1ea90f7123d49bfe806a5982675/murmurhash-1.0.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:507148e50929ba1fce36898808573b9f81c763d5676f3fc6e4e832ff56b66992", size = 125970, upload_time = "2025-05-22T12:35:23.222Z" }, + { url = "https://files.pythonhosted.org/packages/e8/09/0e7afce0a422692506c85474a26fb3a03c1971b2b5f7e7745276c4b3de7f/murmurhash-1.0.13-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d50f6173d266ad165beb8bca6101d824217fc9279f9e9981f4c0245c1e7ee6", size = 123390, upload_time = "2025-05-22T12:35:24.303Z" }, + { url = "https://files.pythonhosted.org/packages/22/4c/c98f579b1a951b2bcc722a35270a2eec105c1e21585c9b314a02079e3c4d/murmurhash-1.0.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0f272e15a84a8ae5f8b4bc0a68f9f47be38518ddffc72405791178058e9d019a", size = 124007, upload_time = "2025-05-22T12:35:25.446Z" }, + { url = "https://files.pythonhosted.org/packages/df/f8/1b0dcebc8df8e091341617102b5b3b97deb6435f345b84f75382c290ec2c/murmurhash-1.0.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9423e0b0964ed1013a06c970199538c7ef9ca28c0be54798c0f1473a6591761", size = 123705, upload_time = "2025-05-22T12:35:26.709Z" }, + { url = "https://files.pythonhosted.org/packages/79/17/f2a38558e150a0669d843f75e128afb83c1a67af41885ea2acb940e18e2a/murmurhash-1.0.13-cp311-cp311-win_amd64.whl", hash = "sha256:83b81e7084b696df3d853f2c78e0c9bda6b285d643f923f1a6fa9ab145d705c5", size = 24572, upload_time = "2025-05-22T12:35:30.38Z" }, + { url = "https://files.pythonhosted.org/packages/e1/53/56ce2d8d4b9ab89557cb1d00ffce346b80a2eb2d8c7944015e5c83eacdec/murmurhash-1.0.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbe882e46cb3f86e092d8a1dd7a5a1c992da1ae3b39f7dd4507b6ce33dae7f92", size = 26859, upload_time = "2025-05-22T12:35:31.815Z" }, + { url = "https://files.pythonhosted.org/packages/f8/85/3a0ad54a61257c31496545ae6861515d640316f93681d1dd917e7be06634/murmurhash-1.0.13-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:52a33a12ecedc432493692c207c784b06b6427ffaa897fc90b7a76e65846478d", size = 26900, upload_time = "2025-05-22T12:35:34.267Z" }, + { url = "https://files.pythonhosted.org/packages/d0/cd/6651de26744b50ff11c79f0c0d41244db039625de53c0467a7a52876b2d8/murmurhash-1.0.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:950403a7f0dc2d9c8d0710f07c296f2daab66299d9677d6c65d6b6fa2cb30aaa", size = 131367, upload_time = "2025-05-22T12:35:35.258Z" }, + { url = "https://files.pythonhosted.org/packages/50/6c/01ded95ddce33811c9766cae4ce32e0a54288da1d909ee2bcaa6ed13b9f1/murmurhash-1.0.13-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fde9fb5d2c106d86ff3ef2e4a9a69c2a8d23ba46e28c6b30034dc58421bc107b", size = 128943, upload_time = "2025-05-22T12:35:36.358Z" }, + { url = "https://files.pythonhosted.org/packages/ab/27/e539a9622d7bea3ae22706c1eb80d4af80f9dddd93b54d151955c2ae4011/murmurhash-1.0.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3aa55d62773745616e1ab19345dece122f6e6d09224f7be939cc5b4c513c8473", size = 129108, upload_time = "2025-05-22T12:35:37.864Z" }, + { url = "https://files.pythonhosted.org/packages/7a/84/18af5662e07d06839ad4db18ce026e6f8ef850d7b0ba92817b28dad28ba6/murmurhash-1.0.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:060dfef1b405cf02c450f182fb629f76ebe7f79657cced2db5054bc29b34938b", size = 129175, upload_time = "2025-05-22T12:35:38.928Z" }, + { url = "https://files.pythonhosted.org/packages/fe/8d/b01d3ee1f1cf3957250223b7c6ce35454f38fbf4abe236bf04a3f769341d/murmurhash-1.0.13-cp312-cp312-win_amd64.whl", hash = "sha256:a8e79627d44a6e20a6487effc30bfe1c74754c13d179106e68cc6d07941b022c", size = 24869, upload_time = "2025-05-22T12:35:40.035Z" }, + { url = "https://files.pythonhosted.org/packages/00/b4/8919dfdc4a131ad38a57b2c5de69f4bd74538bf546637ee59ebaebe6e5a4/murmurhash-1.0.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:b8a7f8befd901379b6dc57a9e49c5188454113747ad6aa8cdd951a6048e10790", size = 26852, upload_time = "2025-05-22T12:35:41.061Z" }, + { url = "https://files.pythonhosted.org/packages/b4/32/ce78bef5d6101568bcb12f5bb5103fabcbe23723ec52e76ff66132d5dbb7/murmurhash-1.0.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f741aab86007510199193eee4f87c5ece92bc5a6ca7d0fe0d27335c1203dface", size = 26900, upload_time = "2025-05-22T12:35:42.097Z" }, + { url = "https://files.pythonhosted.org/packages/0c/4c/0f47c0b4f6b31a1de84d65f9573832c78cd47b4b8ce25ab5596a8238d150/murmurhash-1.0.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82614f18fa6d9d83da6bb0918f3789a3e1555d0ce12c2548153e97f79b29cfc9", size = 130033, upload_time = "2025-05-22T12:35:43.113Z" }, + { url = "https://files.pythonhosted.org/packages/e0/cb/e47233e32fb792dcc9fb18a2cf65f795d47179b29c2b4a2034689f14c707/murmurhash-1.0.13-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91f22a48b9454712e0690aa0b76cf0156a5d5a083d23ec7e209cfaeef28f56ff", size = 130619, upload_time = "2025-05-22T12:35:44.229Z" }, + { url = "https://files.pythonhosted.org/packages/8f/f1/f89911bf304ba5d385ccd346cc7fbb1c1450a24f093b592c3bfe87768467/murmurhash-1.0.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c4bc7938627b8fcb3d598fe6657cc96d1e31f4eba6a871b523c1512ab6dacb3e", size = 127643, upload_time = "2025-05-22T12:35:45.369Z" }, + { url = "https://files.pythonhosted.org/packages/a4/24/262229221f6840c1a04a46051075e99675e591571abcca6b9a8b6aa1602b/murmurhash-1.0.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58a61f1fc840f9ef704e638c39b8517bab1d21f1a9dbb6ba3ec53e41360e44ec", size = 127981, upload_time = "2025-05-22T12:35:46.503Z" }, + { url = "https://files.pythonhosted.org/packages/18/25/addbc1d28f83252732ac3e57334d42f093890b4c2cce483ba01a42bc607c/murmurhash-1.0.13-cp313-cp313-win_amd64.whl", hash = 
"sha256:c451a22f14c2f40e7abaea521ee24fa0e46fbec480c4304c25c946cdb6e81883", size = 24880, upload_time = "2025-05-22T12:35:47.625Z" }, +] + [[package]] name = "numpy" version = "1.26.4" @@ -1468,6 +1668,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload_time = "2024-04-20T21:34:40.434Z" }, ] +[[package]] +name = "preshed" +version = "3.0.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cymem" }, + { name = "murmurhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/3a/db814f67a05b6d7f9c15d38edef5ec9b21415710705b393883de92aee5ef/preshed-3.0.10.tar.gz", hash = "sha256:5a5c8e685e941f4ffec97f1fbf32694b8107858891a4bc34107fac981d8296ff", size = 15039, upload_time = "2025-05-26T15:18:33.612Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/99/c3709638f687da339504d1daeca48604cadb338bf3556a1484d1f0cd95e6/preshed-3.0.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d96c4fe2b41c1cdcc8c4fc1fdb10f922a6095c0430a3ebe361fe62c78902d068", size = 131486, upload_time = "2025-05-26T15:17:52.231Z" }, + { url = "https://files.pythonhosted.org/packages/e0/27/0fd36b63caa8bbf57b31a121d9565d385bbd7521771d4eb93e17d326873d/preshed-3.0.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cb01ea930b96f3301526a2ab26f41347d07555e4378c4144c6b7645074f2ebb0", size = 127938, upload_time = "2025-05-26T15:17:54.19Z" }, + { url = "https://files.pythonhosted.org/packages/90/54/6a876d9cc8d401a9c1fb6bb8ca5a31b3664d0bcb888a9016258a1ae17344/preshed-3.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dd1f0a7b7d150e229d073fd4fe94f72610cae992e907cee74687c4695873a98", size = 842263, upload_time = "2025-05-26T15:17:55.398Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/7d/ff19f74d15ee587905bafa3582883cfe2f72b574e6d691ee64dc690dc276/preshed-3.0.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fd7b350c280137f324cd447afbf6ba9a849af0e8898850046ac6f34010e08bd", size = 842913, upload_time = "2025-05-26T15:17:56.687Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3a/1c345a26463345557705b61965e1e0a732cc0e9c6dfd4787845dbfa50b4a/preshed-3.0.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cf6a5fdc89ad06079aa6ee63621e417d4f4cf2a3d8b63c72728baad35a9ff641", size = 820548, upload_time = "2025-05-26T15:17:58.057Z" }, + { url = "https://files.pythonhosted.org/packages/7f/6b/71f25e2b7a23dba168f43edfae0bb508552dbef89114ce65c73f2ea7172f/preshed-3.0.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b4c29a7bd66985808ad181c9ad05205a6aa7400cd0f98426acd7bc86588b93f8", size = 840379, upload_time = "2025-05-26T15:17:59.565Z" }, + { url = "https://files.pythonhosted.org/packages/3a/86/d8f32b0b31a36ee8770a9b1a95321430e364cd0ba4bfebb7348aed2f198d/preshed-3.0.10-cp311-cp311-win_amd64.whl", hash = "sha256:1367c1fd6f44296305315d4e1c3fe3171787d4d01c1008a76bc9466bd79c3249", size = 117655, upload_time = "2025-05-26T15:18:00.836Z" }, + { url = "https://files.pythonhosted.org/packages/c3/14/322a4f58bc25991a87f216acb1351800739b0794185d27508ee86c35f382/preshed-3.0.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6e9c46933d55c8898c8f7a6019a8062cd87ef257b075ada2dd5d1e57810189ea", size = 131367, upload_time = "2025-05-26T15:18:02.408Z" }, + { url = "https://files.pythonhosted.org/packages/38/80/67507653c35620cace913f617df6d6f658b87e8da83087b851557d65dd86/preshed-3.0.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c4ebc4f8ef0114d55f2ffdce4965378129c7453d0203664aeeb03055572d9e4", size = 126535, upload_time = "2025-05-26T15:18:03.589Z" }, + { url = 
"https://files.pythonhosted.org/packages/db/b1/ab4f811aeaf20af0fa47148c1c54b62d7e8120d59025bd0a3f773bb67725/preshed-3.0.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ab5ab4c6dfd3746fb4328e7fbeb2a0544416b872db02903bfac18e6f5cd412f", size = 864907, upload_time = "2025-05-26T15:18:04.794Z" }, + { url = "https://files.pythonhosted.org/packages/fb/db/fe37c1f99cfb26805dd89381ddd54901307feceb267332eaaca228e9f9c1/preshed-3.0.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40586fd96ae3974c552a7cd78781b6844ecb1559ee7556586f487058cf13dd96", size = 869329, upload_time = "2025-05-26T15:18:06.353Z" }, + { url = "https://files.pythonhosted.org/packages/a7/fd/efb6a6233d1cd969966f3f65bdd8e662579c3d83114e5c356cec1927b1f7/preshed-3.0.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a606c24cda931306b98e0edfafed3309bffcf8d6ecfe07804db26024c4f03cd6", size = 846829, upload_time = "2025-05-26T15:18:07.716Z" }, + { url = "https://files.pythonhosted.org/packages/14/49/0e4ce5db3bf86b081abb08a404fb37b7c2dbfd7a73ec6c0bc71b650307eb/preshed-3.0.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:394015566f9354738be903447039e8dbc6d93ba5adf091af694eb03c4e726b1e", size = 874008, upload_time = "2025-05-26T15:18:09.364Z" }, + { url = "https://files.pythonhosted.org/packages/6f/17/76d6593fc2d055d4e413b68a8c87b70aa9b7697d4972cb8062559edcf6e9/preshed-3.0.10-cp312-cp312-win_amd64.whl", hash = "sha256:fd7e38225937e580420c84d1996dde9b4f726aacd9405093455c3a2fa60fede5", size = 116701, upload_time = "2025-05-26T15:18:11.905Z" }, + { url = "https://files.pythonhosted.org/packages/bf/5e/87671bc58c4f6c8cf0a5601ccd74b8bb50281ff28aa4ab3e3cad5cd9d06a/preshed-3.0.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:23e6e0581a517597f3f76bc24a4cdb0ba5509933d4f61c34fca49649dd71edf9", size = 129184, upload_time = "2025-05-26T15:18:13.331Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/69/b3969a3c95778def5bf5126484a1f7d2ad324d1040077f55f56e027d8ea4/preshed-3.0.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:574e6d6056981540310ff181b47a2912f4bddc91bcace3c7a9c6726eafda24ca", size = 124258, upload_time = "2025-05-26T15:18:14.497Z" }, + { url = "https://files.pythonhosted.org/packages/32/df/6e828ec4565bf33bd4803a3eb3b1102830b739143e5d6c132bf7181a58ec/preshed-3.0.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd658dd73e853d1bb5597976a407feafa681b9d6155bc9bc7b4c2acc2a6ee96", size = 825445, upload_time = "2025-05-26T15:18:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/05/3d/478b585f304920e51f328c9231e22f30dc64baa68e079e08a46ab72be738/preshed-3.0.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b95396046328ffb461a68859ce2141aca4815b8624167832d28ced70d541626", size = 831690, upload_time = "2025-05-26T15:18:17.08Z" }, + { url = "https://files.pythonhosted.org/packages/c3/65/938f21f77227e8d398d46fb10b9d1b3467be859468ce8db138fc3d50589c/preshed-3.0.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3e6728b2028bbe79565eb6cf676b5bae5ce1f9cc56e4bf99bb28ce576f88054d", size = 808593, upload_time = "2025-05-26T15:18:18.535Z" }, + { url = "https://files.pythonhosted.org/packages/6c/1c/2a3961fc88bc72300ff7e4ca54689bda90d2d77cc994167cc09a310480b6/preshed-3.0.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c4ef96cb28bf5f08de9c070143113e168efccbb68fd4961e7d445f734c051a97", size = 837333, upload_time = "2025-05-26T15:18:19.937Z" }, + { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload_time = "2025-05-26T15:18:21.842Z" }, +] + [[package]] name = "propcache" version = "0.3.1" @@ 
-2154,6 +2387,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload_time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "smart-open" +version = "7.3.0.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/2b/5e7234c68ed5bc872ad6ae77b8a421c2ed70dcb1190b44dc1abdeed5e347/smart_open-7.3.0.post1.tar.gz", hash = "sha256:ce6a3d9bc1afbf6234ad13c010b77f8cd36d24636811e3c52c3b5160f5214d1e", size = 51557, upload_time = "2025-07-03T10:06:31.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl", hash = "sha256:c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4", size = 61946, upload_time = "2025-07-03T10:06:29.599Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -2172,6 +2417,74 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload_time = "2025-04-20T18:50:07.196Z" }, ] +[[package]] +name = "spacy" +version = "3.8.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "catalogue" }, + { name = "cymem" }, + { name = "jinja2" }, + { name = "langcodes" }, + { name = "murmurhash" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "preshed" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "spacy-legacy" }, + { name = "spacy-loggers" }, + { name = "srsly" }, + { name = "thinc" }, + { name = "tqdm" }, + { name = "typer" }, + 
{ name = "wasabi" }, + { name = "weasel" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1e/9e/fb4e1cefe3fbd51ea6a243e5a3d2bc629baa9a28930bf4be6fe5672fa1ca/spacy-3.8.7.tar.gz", hash = "sha256:700fd174c6c552276be142c48e70bb53cae24c4dd86003c4432af9cb93e4c908", size = 1316143, upload_time = "2025-05-23T08:55:39.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/c5/5fbb3a4e694d4855a5bab87af9664377c48b89691f180ad3cde4faeaf35c/spacy-3.8.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bdff8b9b556468a6dd527af17f0ddf9fb0b0bee92ee7703339ddf542361cff98", size = 6746140, upload_time = "2025-05-23T08:54:23.483Z" }, + { url = "https://files.pythonhosted.org/packages/03/2a/43afac516eb82409ca47d7206f982beaf265d2ba06a72ca07cf06b290c20/spacy-3.8.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9194b7cf015ed9b4450ffb162da49c8a9305e76b468de036b0948abdfc748a37", size = 6392440, upload_time = "2025-05-23T08:54:25.12Z" }, + { url = "https://files.pythonhosted.org/packages/6f/83/2ea68c18e2b1b9a6f6b30ef63eb9d07e979626b9595acfdb5394f18923c4/spacy-3.8.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7dc38b78d48b9c2a80a3eea95f776304993f63fc307f07cdd104441442f92f1e", size = 32699126, upload_time = "2025-05-23T08:54:27.385Z" }, + { url = "https://files.pythonhosted.org/packages/0a/0a/bb90e9aa0b3c527876627567d82517aabab08006ccf63796c33b0242254d/spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e43bd70772751b8fc7a14f338d087a3d297195d43d171832923ef66204b23ab", size = 33008865, upload_time = "2025-05-23T08:54:30.248Z" }, + { url = "https://files.pythonhosted.org/packages/39/dd/8e906ba378457107ab0394976ea9f7b12fdb2cad682ef1a2ccf473d61e5f/spacy-3.8.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c402bf5dcf345fd96d202378c54bc345219681e3531f911d99567d569328c45f", size = 31933169, upload_time = "2025-05-23T08:54:33.199Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/b5/42df07eb837a923fbb42509864d5c7c2072d010de933dccdfb3c655b3a76/spacy-3.8.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4234189861e486d86f1269e50542d87e8a6391a1ee190652479cf1a793db115f", size = 32776322, upload_time = "2025-05-23T08:54:36.891Z" }, + { url = "https://files.pythonhosted.org/packages/92/e7/8176484801c67dcd814f141991fe0a3c9b5b4a3583ea30c2062e93d1aa6b/spacy-3.8.7-cp311-cp311-win_amd64.whl", hash = "sha256:e9d12e2eb7f36bc11dd9edae011032fe49ea100d63e83177290d3cbd80eaa650", size = 14938936, upload_time = "2025-05-23T08:54:40.322Z" }, + { url = "https://files.pythonhosted.org/packages/a5/10/89852f40f926e0902c11c34454493ba0d15530b322711e754b89a6d7dfe6/spacy-3.8.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:88b397e37793cea51df298e6c651a763e49877a25bead5ba349761531a456687", size = 6265335, upload_time = "2025-05-23T08:54:42.876Z" }, + { url = "https://files.pythonhosted.org/packages/16/fb/b5d54522969a632c06f4af354763467553b66d5bf0671ac39f3cceb3fd54/spacy-3.8.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f70b676955fa6959347ca86ed6edd8ff0d6eb2ba20561fdfec76924bd3e540f9", size = 5906035, upload_time = "2025-05-23T08:54:44.824Z" }, + { url = "https://files.pythonhosted.org/packages/3a/03/70f06753fd65081404ade30408535eb69f627a36ffce2107116d1aa16239/spacy-3.8.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c4b5a624797ade30c25b5b69daa35a93ee24bcc56bd79b0884b2565f76f35d6", size = 33420084, upload_time = "2025-05-23T08:54:46.889Z" }, + { url = "https://files.pythonhosted.org/packages/f9/19/b60e1ebf4985ee2b33d85705b89a5024942b65dad04dbdc3fb46f168b410/spacy-3.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9d83e006df66decccefa3872fa958b3756228fb216d83783595444cf42ca10c", size = 33922188, upload_time = "2025-05-23T08:54:49.781Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/a3/1fb1a49dc6d982d96fffc30c3a31bb431526008eea72ac3773f6518720a6/spacy-3.8.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dca25deba54f3eb5dcfbf63bf16e613e6c601da56f91c4a902d38533c098941", size = 31939285, upload_time = "2025-05-23T08:54:53.162Z" }, + { url = "https://files.pythonhosted.org/packages/2d/55/6cf1aff8e5c01ee683e828f3ccd9282d2aff7ca1143a9349ee3d0c1291ff/spacy-3.8.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5eef3f805a1c118d9b709a23e2d378f5f20da5a0d6258c9cfdc87c4cb234b4fc", size = 32988845, upload_time = "2025-05-23T08:54:57.776Z" }, + { url = "https://files.pythonhosted.org/packages/8c/47/c17ee61b51aa8497d8af0999224b4b62485111a55ec105a06886685b2c68/spacy-3.8.7-cp312-cp312-win_amd64.whl", hash = "sha256:25d7a68e445200c9e9dc0044f8b7278ec0ef01ccc7cb5a95d1de2bd8e3ed6be2", size = 13918682, upload_time = "2025-05-23T08:55:00.387Z" }, + { url = "https://files.pythonhosted.org/packages/2a/95/7125bea6d432c601478bf922f7a568762c8be425bbde5b66698260ab0358/spacy-3.8.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dda7d57f42ec57c19fbef348095a9c82504e4777bca7b8db4b0d8318ba280fc7", size = 6235950, upload_time = "2025-05-23T08:55:02.92Z" }, + { url = "https://files.pythonhosted.org/packages/96/c3/d2362846154d4d341136774831605df02d61f49ac637524a15f4f2794874/spacy-3.8.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:de0e0bddb810ed05bce44bcb91460eabe52bc56323da398d2ca74288a906da35", size = 5878106, upload_time = "2025-05-23T08:55:04.496Z" }, + { url = "https://files.pythonhosted.org/packages/50/b6/b2943acfbfc4fc12642dac9feb571e712dd1569ab481db8f3daedee045fe/spacy-3.8.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a2e58f92b684465777a7c1a65d5578b1dc36fe55c48d9964fb6d46cc9449768", size = 33085866, upload_time = "2025-05-23T08:55:06.65Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/98/c4415cbb217ac0b502dbb3372136015c699dd16a0c47cd6d338cd15f4bed/spacy-3.8.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46330da2eb357d6979f40ea8fc16ee5776ee75cd0c70aac2a4ea10c80364b8f3", size = 33398424, upload_time = "2025-05-23T08:55:10.477Z" }, + { url = "https://files.pythonhosted.org/packages/12/45/12a198858f1f11c21844876e039ba90df59d550527c72996d418c1faf78d/spacy-3.8.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:86b6a6ad23ca5440ef9d29c2b1e3125e28722c927db612ae99e564d49202861c", size = 31530066, upload_time = "2025-05-23T08:55:13.329Z" }, + { url = "https://files.pythonhosted.org/packages/9c/df/80524f99822eb96c9649200042ec5912357eec100cf0cd678a2e9ef0ecb3/spacy-3.8.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccfe468cbb370888153df145ce3693af8e54dae551940df49057258081b2112f", size = 32613343, upload_time = "2025-05-23T08:55:16.711Z" }, + { url = "https://files.pythonhosted.org/packages/02/99/881f6f24c279a5a70b8d69aaf8266fd411a0a58fd1c8848112aaa348f6f6/spacy-3.8.7-cp313-cp313-win_amd64.whl", hash = "sha256:ca81e416ff35209769e8b5dd5d13acc52e4f57dd9d028364bccbbe157c2ae86b", size = 13911250, upload_time = "2025-05-23T08:55:19.606Z" }, +] + +[[package]] +name = "spacy-legacy" +version = "3.0.12" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d9/79/91f9d7cc8db5642acad830dcc4b49ba65a7790152832c4eceb305e46d681/spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774", size = 23806, upload_time = "2023-01-23T09:04:15.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f", size = 29971, upload_time = "2023-01-23T09:04:13.45Z" }, +] + +[[package]] +name = 
"spacy-loggers" +version = "1.0.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/67/3d/926db774c9c98acf66cb4ed7faf6c377746f3e00b84b700d0868b95d0712/spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24", size = 20811, upload_time = "2023-09-11T12:26:52.323Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645", size = 22343, upload_time = "2023-09-11T12:26:50.586Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.40" @@ -2218,6 +2531,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 44415, upload_time = "2024-12-10T12:05:27.824Z" }, ] +[[package]] +name = "srsly" +version = "2.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "catalogue" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/eb51b1349f50bac0222398af0942613fdc9d1453ae67cbe4bf9936a1a54b/srsly-2.5.1.tar.gz", hash = "sha256:ab1b4bf6cf3e29da23dae0493dd1517fb787075206512351421b89b4fc27c77e", size = 466464, upload_time = "2025-01-17T09:26:26.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/9c/a248bb49de499fe0990e3cb0fb341c2373d8863ef9a8b5799353cade5731/srsly-2.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58f0736794ce00a71d62a39cbba1d62ea8d5be4751df956e802d147da20ecad7", size = 635917, upload_time = "2025-01-17T09:25:25.109Z" }, + { url = "https://files.pythonhosted.org/packages/41/47/1bdaad84502df973ecb8ca658117234cf7fb20e1dec60da71dce82de993f/srsly-2.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:7a8269c40859806d71920396d185f4f38dc985cdb6a28d3a326a701e29a5f629", size = 634374, upload_time = "2025-01-17T09:25:26.609Z" }, + { url = "https://files.pythonhosted.org/packages/e5/2a/d73c71989fcf2a6d1fa518d75322aff4db01a8763f167f8c5e00aac11097/srsly-2.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:889905900401fefc1032e22b73aecbed8b4251aa363f632b2d1f86fc16f1ad8e", size = 1108390, upload_time = "2025-01-17T09:25:29.32Z" }, + { url = "https://files.pythonhosted.org/packages/35/a3/9eda9997a8bd011caed18fdaa5ce606714eb06d8dab587ed0522b3e92ab1/srsly-2.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf454755f22589df49c25dc799d8af7b47dce3d861dded35baf0f0b6ceab4422", size = 1110712, upload_time = "2025-01-17T09:25:31.051Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ef/4b50bc05d06349f905b27f824cc23b652098efd4be19aead3af4981df647/srsly-2.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc0607c8a59013a51dde5c1b4e465558728e9e0a35dcfa73c7cbefa91a0aad50", size = 1081244, upload_time = "2025-01-17T09:25:32.611Z" }, + { url = "https://files.pythonhosted.org/packages/90/af/d4a2512d9a5048d2b18efead39d4c4404bddd4972935bbc68211292a736c/srsly-2.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d5421ba3ab3c790e8b41939c51a1d0f44326bfc052d7a0508860fb79a47aee7f", size = 1091692, upload_time = "2025-01-17T09:25:34.15Z" }, + { url = "https://files.pythonhosted.org/packages/bb/da/657a685f63028dcb00ccdc4ac125ed347c8bff6fa0dab6a9eb3dc45f3223/srsly-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:b96ea5a9a0d0379a79c46d255464a372fb14c30f59a8bc113e4316d131a530ab", size = 632627, upload_time = "2025-01-17T09:25:37.36Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f6/bebc20d75bd02121fc0f65ad8c92a5dd2570e870005e940faa55a263e61a/srsly-2.5.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:683b54ed63d7dfee03bc2abc4b4a5f2152f81ec217bbadbac01ef1aaf2a75790", size = 636717, upload_time = 
"2025-01-17T09:25:40.236Z" }, + { url = "https://files.pythonhosted.org/packages/b6/e8/9372317a4742c70b87b413335adfcdfb2bee4f88f3faba89fabb9e6abf21/srsly-2.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:459d987130e57e83ce9e160899afbeb871d975f811e6958158763dd9a8a20f23", size = 634697, upload_time = "2025-01-17T09:25:43.605Z" }, + { url = "https://files.pythonhosted.org/packages/d5/00/c6a7b99ab27b051a27bd26fe1a8c1885225bb8980282bf9cb99f70610368/srsly-2.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:184e3c98389aab68ff04aab9095bd5f1a8e5a72cc5edcba9d733bac928f5cf9f", size = 1134655, upload_time = "2025-01-17T09:25:45.238Z" }, + { url = "https://files.pythonhosted.org/packages/c2/e6/861459e8241ec3b78c111081bd5efa414ef85867e17c45b6882954468d6e/srsly-2.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c2a3e4856e63b7efd47591d049aaee8e5a250e098917f50d93ea68853fab78", size = 1143544, upload_time = "2025-01-17T09:25:47.485Z" }, + { url = "https://files.pythonhosted.org/packages/2d/85/8448fe874dd2042a4eceea5315cfff3af03ac77ff5073812071852c4e7e2/srsly-2.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:366b4708933cd8d6025c13c2cea3331f079c7bb5c25ec76fca392b6fc09818a0", size = 1098330, upload_time = "2025-01-17T09:25:52.55Z" }, + { url = "https://files.pythonhosted.org/packages/ef/7e/04d0e1417da140b2ac4053a3d4fcfc86cd59bf4829f69d370bb899f74d5d/srsly-2.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8a0b03c64eb6e150d772c5149befbadd981cc734ab13184b0561c17c8cef9b1", size = 1110670, upload_time = "2025-01-17T09:25:54.02Z" }, + { url = "https://files.pythonhosted.org/packages/96/1a/a8cd627eaa81a91feb6ceab50155f4ceff3eef6107916cb87ef796958427/srsly-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:7952538f6bba91b9d8bf31a642ac9e8b9ccc0ccbb309feb88518bfb84bb0dc0d", size = 632598, upload_time = "2025-01-17T09:25:55.499Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/94/cab36845aad6e2c22ecee1178accaa365657296ff87305b805648fd41118/srsly-2.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84b372f7ef1604b4a5b3cee1571993931f845a5b58652ac01bcb32c52586d2a8", size = 634883, upload_time = "2025-01-17T09:25:58.363Z" }, + { url = "https://files.pythonhosted.org/packages/67/8b/501f51f4eaee7e1fd7327764799cb0a42f5d0de042a97916d30dbff770fc/srsly-2.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6ac3944c112acb3347a39bfdc2ebfc9e2d4bace20fe1c0b764374ac5b83519f2", size = 632842, upload_time = "2025-01-17T09:25:59.777Z" }, + { url = "https://files.pythonhosted.org/packages/07/be/5b8fce4829661e070a7d3e262d2e533f0e297b11b8993d57240da67d7330/srsly-2.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6118f9c4b221cde0a990d06a42c8a4845218d55b425d8550746fe790acf267e9", size = 1118516, upload_time = "2025-01-17T09:26:01.234Z" }, + { url = "https://files.pythonhosted.org/packages/91/60/a34e97564eac352c0e916c98f44b6f566b7eb6a9fb60bcd60ffa98530762/srsly-2.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7481460110d9986781d9e4ac0f5f991f1d6839284a80ad268625f9a23f686950", size = 1127974, upload_time = "2025-01-17T09:26:04.007Z" }, + { url = "https://files.pythonhosted.org/packages/70/a2/f642334db0cabd187fa86b8773257ee6993c6009338a6831d4804e2c5b3c/srsly-2.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e57b8138082f09e35db60f99757e16652489e9e3692471d8e0c39aa95180688", size = 1086098, upload_time = "2025-01-17T09:26:05.612Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9b/be48e185c5a010e71b5135e4cdf317ff56b8ac4bc08f394bbf882ac13b05/srsly-2.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bab90b85a63a1fe0bbc74d373c8bb9bb0499ddfa89075e0ebe8d670f12d04691", size = 1100354, upload_time = "2025-01-17T09:26:07.215Z" }, + { url = 
"https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload_time = "2025-01-17T09:26:10.018Z" }, +] + [[package]] name = "starlette" version = "0.45.3" @@ -2239,6 +2584,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154, upload_time = "2019-08-30T21:37:03.543Z" }, ] +[[package]] +name = "thinc" +version = "8.3.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blis" }, + { name = "catalogue" }, + { name = "confection" }, + { name = "cymem" }, + { name = "murmurhash" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "preshed" }, + { name = "pydantic" }, + { name = "setuptools" }, + { name = "srsly" }, + { name = "wasabi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ff/60c9bcfe28e56c905aac8e61a838c7afe5dc3073c9beed0b63a26ace0bb7/thinc-8.3.4.tar.gz", hash = "sha256:b5925482498bbb6dca0771e375b35c915818f735891e93d93a662dab15f6ffd8", size = 193903, upload_time = "2025-01-13T12:47:51.698Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/47/68187c78a04cdc31cbd3ae393068f994b60476b5ecac6dfe7d04b124aacf/thinc-8.3.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a8bb4b47358a1855803b375f4432cefdf373f46ef249b554418d2e77c7323040", size = 839320, upload_time = "2025-01-13T12:47:12.317Z" }, + { url = "https://files.pythonhosted.org/packages/49/ea/066dd415e61fcef20083bbca41c2c02e640fea71326531f2619708efee1e/thinc-8.3.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:00ed92f9a34b9794f51fcd48467c863f4eb7c5b41559aef6ef3c980c21378fec", size = 774196, upload_time = "2025-01-13T12:47:15.315Z" 
}, + { url = "https://files.pythonhosted.org/packages/8c/68/36c1a92a374891e0d496677c59f5f9fdc1e57bbb214c487bb8bb3e9290c2/thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85691fca84a6a1506f7ddbd2c1706a5524d56f65582e76b2e260a06d9e83e86d", size = 3922504, upload_time = "2025-01-13T12:47:22.07Z" }, + { url = "https://files.pythonhosted.org/packages/ec/8a/48e463240a586e91f83c87660986e520aa91fbd839f6631ee9bc0fbb3cbd/thinc-8.3.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:eae1573fc19e514defc1bfd4f93f0b4bfc1dcefdb6d70bad1863825747f24800", size = 4932946, upload_time = "2025-01-13T12:47:24.177Z" }, + { url = "https://files.pythonhosted.org/packages/d9/98/f910b8d8113ab9b955a68e9bbf0d5bd0e828f22dd6d3c226af6ec3970817/thinc-8.3.4-cp311-cp311-win_amd64.whl", hash = "sha256:81e8638f9bdc38e366674acc4b63cf7c6267266a15477963a5db21b3d9f1aa36", size = 1490133, upload_time = "2025-01-13T12:47:26.152Z" }, + { url = "https://files.pythonhosted.org/packages/90/ff/d1b5d7e1a7f95581e9a736f50a5a9aff72327ddbbc629a68070c36acefd9/thinc-8.3.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c9da6375b106df5186bd2bfd1273bc923c01ab7d482f8942e4ee528a28965c3a", size = 825099, upload_time = "2025-01-13T12:47:27.881Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0b/d207c917886dc40671361de0880ec3ea0443a718aae9dbb0a50ac0849f92/thinc-8.3.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:07091c6b5faace50857c4cf0982204969d77388d0a6f156dd2442297dceeb838", size = 761024, upload_time = "2025-01-13T12:47:29.739Z" }, + { url = "https://files.pythonhosted.org/packages/4b/a3/3ec5e9d7cbebc3257b8223a3d188216b91ab6ec1e66b6fdd99d22394bc62/thinc-8.3.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd40ad71bcd8b1b9daa0462e1255b1c1e86e901c2fd773966601f44a95878032", size = 3710390, upload_time = "2025-01-13T12:47:33.019Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/ee/955c74e4e6ff2f694c99dcbbf7be8d478a8868503aeb3474517277c07667/thinc-8.3.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb10823b3a3f1c6440998b11bf9a3571dd859feaed0fdb510a1c1097d9dc6a86", size = 4731524, upload_time = "2025-01-13T12:47:35.203Z" }, + { url = "https://files.pythonhosted.org/packages/a4/44/3786431e5c1eeebed3d7a4c97122896ca6d4a502b03d02c2171c417052fd/thinc-8.3.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5e5e7bf5dae142fd50ed9785971292c4aab4d9ed18e4947653b6a0584d5227c", size = 1455883, upload_time = "2025-01-13T12:47:36.914Z" }, +] + [[package]] name = "tqdm" version = "4.67.1" @@ -2376,6 +2753,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/9a/0962b05b308494e3202d3f794a6e85abe471fe3cafdbcf95c2e8c713aabd/uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553", size = 4660018, upload_time = "2024-10-14T23:38:10.888Z" }, ] +[[package]] +name = "wasabi" +version = "1.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload_time = "2024-05-31T16:56:18.99Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/7c/34330a89da55610daa5f245ddce5aab81244321101614751e7537f125133/wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c", size = 27880, upload_time = "2024-05-31T16:56:16.699Z" }, +] + [[package]] name = "watchfiles" version = "1.0.5" @@ -2425,6 +2814,26 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/a8/b4/c57b99518fadf431f3ef47a610839e46e5f8abf9814f969859d1c65c02c7/watchfiles-1.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:f436601594f15bf406518af922a89dcaab416568edb6f65c4e5bbbad1ea45c11", size = 291087, upload_time = "2025-04-08T10:35:52.458Z" }, ] +[[package]] +name = "weasel" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cloudpathlib" }, + { name = "confection" }, + { name = "packaging" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "smart-open" }, + { name = "srsly" }, + { name = "typer" }, + { name = "wasabi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a7/1a/9c522dd61b52939c217925d3e55c95f9348b73a66a956f52608e1e59a2c0/weasel-0.4.1.tar.gz", hash = "sha256:aabc210f072e13f6744e5c3a28037f93702433405cd35673f7c6279147085aa9", size = 38417, upload_time = "2024-05-15T08:52:54.765Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/87/abd57374044e1f627f0a905ac33c1a7daab35a3a815abfea4e1bafd3fdb1/weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c", size = 50270, upload_time = "2024-05-15T08:52:52.977Z" }, +] + [[package]] name = "websockets" version = "15.0.1" @@ -2467,6 +2876,65 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload_time = "2025-03-05T20:03:39.41Z" }, ] +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload_time = "2025-08-12T05:53:21.714Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/db/00e2a219213856074a213503fdac0511203dceefff26e1daa15250cc01a0/wrapt-1.17.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:273a736c4645e63ac582c60a56b0acb529ef07f78e08dc6bfadf6a46b19c0da7", size = 53482, upload_time = "2025-08-12T05:51:45.79Z" }, + { url = "https://files.pythonhosted.org/packages/5e/30/ca3c4a5eba478408572096fe9ce36e6e915994dd26a4e9e98b4f729c06d9/wrapt-1.17.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5531d911795e3f935a9c23eb1c8c03c211661a5060aab167065896bbf62a5f85", size = 38674, upload_time = "2025-08-12T05:51:34.629Z" }, + { url = "https://files.pythonhosted.org/packages/31/25/3e8cc2c46b5329c5957cec959cb76a10718e1a513309c31399a4dad07eb3/wrapt-1.17.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0610b46293c59a3adbae3dee552b648b984176f8562ee0dba099a56cfbe4df1f", size = 38959, upload_time = "2025-08-12T05:51:56.074Z" }, + { url = "https://files.pythonhosted.org/packages/5d/8f/a32a99fc03e4b37e31b57cb9cefc65050ea08147a8ce12f288616b05ef54/wrapt-1.17.3-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b32888aad8b6e68f83a8fdccbf3165f5469702a7544472bdf41f582970ed3311", size = 82376, upload_time = "2025-08-12T05:52:32.134Z" }, + { url = "https://files.pythonhosted.org/packages/31/57/4930cb8d9d70d59c27ee1332a318c20291749b4fba31f113c2f8ac49a72e/wrapt-1.17.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cccf4f81371f257440c88faed6b74f1053eef90807b77e31ca057b2db74edb1", size = 83604, upload_time = "2025-08-12T05:52:11.663Z" }, + { url = "https://files.pythonhosted.org/packages/a8/f3/1afd48de81d63dd66e01b263a6fbb86e1b5053b419b9b33d13e1f6d0f7d0/wrapt-1.17.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8a210b158a34164de8bb68b0e7780041a903d7b00c87e906fb69928bf7890d5", size = 82782, upload_time = "2025-08-12T05:52:12.626Z" }, + { url = 
"https://files.pythonhosted.org/packages/1e/d7/4ad5327612173b144998232f98a85bb24b60c352afb73bc48e3e0d2bdc4e/wrapt-1.17.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:79573c24a46ce11aab457b472efd8d125e5a51da2d1d24387666cd85f54c05b2", size = 82076, upload_time = "2025-08-12T05:52:33.168Z" }, + { url = "https://files.pythonhosted.org/packages/bb/59/e0adfc831674a65694f18ea6dc821f9fcb9ec82c2ce7e3d73a88ba2e8718/wrapt-1.17.3-cp311-cp311-win32.whl", hash = "sha256:c31eebe420a9a5d2887b13000b043ff6ca27c452a9a22fa71f35f118e8d4bf89", size = 36457, upload_time = "2025-08-12T05:53:03.936Z" }, + { url = "https://files.pythonhosted.org/packages/83/88/16b7231ba49861b6f75fc309b11012ede4d6b0a9c90969d9e0db8d991aeb/wrapt-1.17.3-cp311-cp311-win_amd64.whl", hash = "sha256:0b1831115c97f0663cb77aa27d381237e73ad4f721391a9bfb2fe8bc25fa6e77", size = 38745, upload_time = "2025-08-12T05:53:02.885Z" }, + { url = "https://files.pythonhosted.org/packages/9a/1e/c4d4f3398ec073012c51d1c8d87f715f56765444e1a4b11e5180577b7e6e/wrapt-1.17.3-cp311-cp311-win_arm64.whl", hash = "sha256:5a7b3c1ee8265eb4c8f1b7d29943f195c00673f5ab60c192eba2d4a7eae5f46a", size = 36806, upload_time = "2025-08-12T05:52:53.368Z" }, + { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload_time = "2025-08-12T05:51:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload_time = "2025-08-12T05:51:35.906Z" }, + { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload_time = "2025-08-12T05:51:57.474Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload_time = "2025-08-12T05:52:34.784Z" }, + { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload_time = "2025-08-12T05:52:13.599Z" }, + { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload_time = "2025-08-12T05:52:14.56Z" }, + { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload_time = "2025-08-12T05:52:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload_time = "2025-08-12T05:53:07.123Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", 
size = 38877, upload_time = "2025-08-12T05:53:05.436Z" }, + { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload_time = "2025-08-12T05:52:54.367Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload_time = "2025-08-12T05:51:48.627Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload_time = "2025-08-12T05:51:37.156Z" }, + { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload_time = "2025-08-12T05:51:58.425Z" }, + { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload_time = "2025-08-12T05:52:37.53Z" }, + { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload_time = "2025-08-12T05:52:15.886Z" }, + { url 
= "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload_time = "2025-08-12T05:52:17.914Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload_time = "2025-08-12T05:52:39.243Z" }, + { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload_time = "2025-08-12T05:53:10.074Z" }, + { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload_time = "2025-08-12T05:53:08.695Z" }, + { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload_time = "2025-08-12T05:52:55.34Z" }, + { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload_time = "2025-08-12T05:51:49.864Z" }, + { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload_time = "2025-08-12T05:51:38.935Z" }, + { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload_time = "2025-08-12T05:51:59.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload_time = "2025-08-12T05:52:40.965Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload_time = "2025-08-12T05:52:20.326Z" }, + { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload_time = "2025-08-12T05:52:21.581Z" }, + { url = "https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload_time = "2025-08-12T05:52:43.043Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = 
"sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload_time = "2025-08-12T05:53:12.605Z" }, + { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload_time = "2025-08-12T05:53:11.106Z" }, + { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload_time = "2025-08-12T05:52:56.531Z" }, + { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload_time = "2025-08-12T05:51:51.109Z" }, + { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload_time = "2025-08-12T05:51:39.912Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload_time = "2025-08-12T05:52:00.693Z" }, + { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload_time = 
"2025-08-12T05:52:44.521Z" }, + { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload_time = "2025-08-12T05:52:22.618Z" }, + { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload_time = "2025-08-12T05:52:24.057Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload_time = "2025-08-12T05:52:45.976Z" }, + { url = "https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload_time = "2025-08-12T05:53:15.214Z" }, + { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload_time = "2025-08-12T05:53:14.178Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload_time = "2025-08-12T05:52:57.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload_time = "2025-08-12T05:53:20.674Z" }, +] + [[package]] name = "xxhash" version = "3.5.0" From a8acbda286a0b9806a100e2365d0ab3671a4c089 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 4 Sep 2025 06:39:24 -0400 Subject: [PATCH 17/33] Update Draft --- ...aee0dd79_overhaul_agency_identification.py | 46 ++-------- .../agency/get/queries/next_for_annotation.py | 28 ++---- src/api/endpoints/review/next/convert.py | 90 +++++++++++++++++++ src/api/endpoints/review/next/query.py | 3 +- src/db/dto_converter.py | 67 -------------- src/db/models/impl/url/core/sqlalchemy.py | 3 + .../suggestion/agency/subtask/sqlalchemy.py | 7 ++ .../agency/suggestion/sqlalchemy.py | 6 +- .../views/has_agency_auto_suggestion.py | 31 ------- .../integration/db/structure/test_view.py | 70 --------------- 10 files changed, 125 insertions(+), 226 deletions(-) create mode 100644 src/api/endpoints/review/next/convert.py delete mode 100644 src/db/models/views/has_agency_auto_suggestion.py delete mode 100644 tests/automated/integration/db/structure/test_view.py diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py index a255fa45..a58c5e56 100644 --- a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -46,7 +46,6 @@ def upgrade() -> None: _create_url_auto_agency_subtask_table() _create_url_unknown_agencies_view() _create_link_agency_id_subtask_agencies_table() - _create_url_has_agency_suggestions_view() _create_new_url_annotation_flags_view() _drop_url_auto_agency_suggestions_table() @@ -55,7 +54,6 @@ def downgrade() -> None: 
_drop_url_unknown_agencies_view() _create_url_auto_agency_suggestions_table() _create_old_url_annotation_flags_view() - _drop_url_has_agency_suggestions_view() _drop_link_agency_id_subtask_agencies_table() _drop_url_auto_agency_subtask_table() SUBTASK_DETAIL_CODE_ENUM.drop(op.get_bind()) @@ -66,47 +64,26 @@ def _drop_url_auto_agency_suggestions_table(): def _create_new_url_annotation_flags_view(): + op.execute( f""" CREATE OR REPLACE VIEW url_annotation_flags AS ( SELECT u.id, - CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, - CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, - auas.has_agency_suggestions AS has_auto_agency_suggestion, - CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, - CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, - CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, - CASE WHEN lua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, - CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed + EXISTS (SELECT 1 FROM public.auto_record_type_suggestions a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion, + EXISTS (SELECT 1 FROM public.auto_relevant_suggestions a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion, + EXISTS (SELECT 1 FROM public.{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} a WHERE a.url_id = u.id) AS has_auto_agency_suggestion, + EXISTS (SELECT 1 FROM public.user_record_type_suggestions a WHERE a.url_id = u.id) AS has_user_record_type_suggestion, + EXISTS (SELECT 1 FROM public.user_relevant_suggestions a WHERE a.url_id = u.id) AS has_user_relevant_suggestion, + EXISTS (SELECT 1 FROM public.user_url_agency_suggestions a WHERE a.url_id = u.id) AS has_user_agency_suggestion, + EXISTS (SELECT 1 FROM public.link_urls_agency a WHERE a.url_id = u.id) AS has_confirmed_agency, + EXISTS 
(SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed FROM urls u - LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id - LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id - LEFT JOIN public.{URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME} auas ON u.id = auas.url_id - LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id - LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id - LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id - LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id - LEFT JOIN public.link_urls_agency lua on u.id = lua.url_id ) """ ) -def _create_url_has_agency_suggestions_view(): - op.execute( - f""" - CREATE OR REPLACE VIEW {URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME} AS - SELECT - u.id as url_id, - (uas.id IS NOT NULL) AS has_agency_suggestions - FROM public.urls u - LEFT JOIN public.{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} uas on u.id = uas.url_id - """ - ) - pass - - def _create_url_unknown_agencies_view(): op.execute( f""" @@ -212,11 +189,6 @@ def _create_url_auto_agency_suggestions_table(): def _drop_url_unknown_agencies_view(): op.execute(f"DROP VIEW IF EXISTS {URL_UNKNOWN_AGENCIES_VIEW_NAME}") - -def _drop_url_has_agency_suggestions_view(): - op.execute(f"DROP VIEW IF EXISTS {URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME}") - - def _drop_url_annotation_flags_view(): op.execute("DROP VIEW url_annotation_flags;") diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index e8f22870..5fd8cea9 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -14,6 +14,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from 
src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder @@ -50,28 +51,17 @@ async def run( URL.status == URLStatus.OK.value ) - - # Must not have been annotated by a user query = ( - query.join(UserUrlAgencySuggestion, isouter=True) + query.join( + URLAnnotationFlagsView, + URLAnnotationFlagsView.url_id == URL.id + ) + # Must not have been annotated by a user .where( - ~exists( - select(UserUrlAgencySuggestion). - where(UserUrlAgencySuggestion.url_id == URL.id). - correlate(URL) - ) + URLAnnotationFlagsView.has_user_agency_suggestion.is_(False), + # Must have extant autosuggestions + URLAnnotationFlagsView.has_auto_agency_suggestion.is_(True) ) - # Must have extant autosuggestions - # TODO: Replace with new logic - # .join(AutomatedUrlAgencySuggestion, isouter=True) - # .where( - # exists( - # select(AutomatedUrlAgencySuggestion). - # where(AutomatedUrlAgencySuggestion.url_id == URL.id). 
- # correlate(URL) - # ) - # ) - # Must not have confirmed agencies .join(LinkURLAgency, isouter=True) .where( ~exists( diff --git a/src/api/endpoints/review/next/convert.py b/src/api/endpoints/review/next/convert.py new file mode 100644 index 00000000..ba443a8f --- /dev/null +++ b/src/api/endpoints/review/next/convert.py @@ -0,0 +1,90 @@ +from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.review.next.dto import FinalReviewAnnotationAgencyInfo +from src.core.enums import SuggestionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion + + +def convert_agency_info_to_final_review_annotation_agency_info( + automated_agency_suggestions: list[None], + confirmed_agencies: list[LinkURLAgency], + user_agency_suggestion: UserUrlAgencySuggestion +) -> FinalReviewAnnotationAgencyInfo: + + confirmed_agency_info = _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies + ) + + # TODO: Revise + # agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( + # automated_agency_suggestions + # ) + agency_auto_info = None + + agency_user_info = _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestion + ) + + return FinalReviewAnnotationAgencyInfo( + confirmed=confirmed_agency_info, + user=agency_user_info, + auto=agency_auto_info + ) + +def _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies: list[LinkURLAgency] +) -> list[GetNextURLForAgencyAgencyInfo]: + results: list[GetNextURLForAgencyAgencyInfo] = [] + for confirmed_agency in confirmed_agencies: + agency = confirmed_agency.agency + agency_info = 
_convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.CONFIRMED, + agency=agency + ) + results.append(agency_info) + return results + +def _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_url_agency_suggestion: UserUrlAgencySuggestion +) -> GetNextURLForAgencyAgencyInfo | None: + suggestion = user_url_agency_suggestion + if suggestion is None: + return None + if suggestion.is_new: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.NEW_AGENCY, + ) + return _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.USER_SUGGESTION, + agency=suggestion.agency + ) + +def _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type: SuggestionType, + agency: Agency +) -> GetNextURLForAgencyAgencyInfo: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=suggestion_type, + pdap_agency_id=agency.agency_id, + agency_name=agency.name, + state=agency.state, + county=agency.county, + locality=agency.locality + ) + +def _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( + subtasks: list[URLAutoAgencyIDSubtask] +) -> list[GetNextURLForAgencyAgencyInfo]: + results: list[GetNextURLForAgencyAgencyInfo] = [] + for subtask in subtasks: + if not subtask.agencies_found: + continue + for suggestion in subtask.suggestions: + info: GetNextURLForAgencyAgencyInfo = _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + agency=suggestion.agency + ) + results.append(info) + return results \ No newline at end of file diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 8c50a7af..9e87737c 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -4,6 +4,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload +from 
src.api.endpoints.review.next.convert import convert_agency_info_to_final_review_annotation_agency_info from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata, FinalReviewBatchInfo, \ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from src.collectors.enums import URLStatus @@ -263,7 +264,7 @@ async def run( user_suggestion=result.user_record_type_suggestion, auto_suggestion=result.auto_record_type_suggestion ), - agency=DTOConverter.final_review_annotation_agency_info( + agency=convert_agency_info_to_final_review_annotation_agency_info( automated_agency_suggestions=result.automated_agency_suggestions, user_agency_suggestion=result.user_agency_suggestion, confirmed_agencies=result.confirmed_agencies diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 39b53b89..b19b834d 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -62,73 +62,6 @@ def final_review_annotation_record_type_info( user=user_value ) - @staticmethod - def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_url_agency_suggestion: UserUrlAgencySuggestion - ) -> GetNextURLForAgencyAgencyInfo | None: - suggestion = user_url_agency_suggestion - if suggestion is None: - return None - if suggestion.is_new: - return GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.NEW_AGENCY, - ) - return GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.USER_SUGGESTION, - pdap_agency_id=suggestion.agency_id, - agency_name=suggestion.agency.name, - state=suggestion.agency.state, - county=suggestion.agency.county, - locality=suggestion.agency.locality - ) - - - @staticmethod - def confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies: list[LinkURLAgency] - ) -> list[GetNextURLForAgencyAgencyInfo]: - results = [] - for confirmed_agency in confirmed_agencies: - agency = confirmed_agency.agency - agency_info = GetNextURLForAgencyAgencyInfo( - 
suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=agency.agency_id, - agency_name=agency.name, - state=agency.state, - county=agency.county, - locality=agency.locality - ) - results.append(agency_info) - return results - - - @staticmethod - def final_review_annotation_agency_info( - # TODO: Revise - automated_agency_suggestions: list[None], - confirmed_agencies: list[LinkURLAgency], - user_agency_suggestion: UserUrlAgencySuggestion - ): - - confirmed_agency_info = DTOConverter.confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies - ) - - # TODO: Revise - # agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( - # automated_agency_suggestions - # ) - agency_auto_info = None - - agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_agency_suggestion - ) - - return FinalReviewAnnotationAgencyInfo( - confirmed=confirmed_agency_info, - user=agency_user_info, - auto=agency_auto_info - ) @staticmethod diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 9548136d..7411f934 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -53,6 +53,9 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # TODO: Revise # automated_agency_suggestions = relationship( # "AutomatedUrlAgencySuggestion", back_populates="url") + auto_agency_suggestions = relationship( + "URLAutoAgencyIDSubtask" + ) user_agency_suggestion = relationship( "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py index ec04d471..8066b199 100644 --- a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -1,3 +1,5 @@ +from 
sqlalchemy.orm import relationship + from src.db.models.helpers import enum_column from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, TaskDependentMixin @@ -25,4 +27,9 @@ class URLAutoAgencyIDSubtask( detail = enum_column( SubtaskDetailCode, name="agency_id_subtask_detail_code", + ) + + suggestions = relationship( + "AgencyIDSubtaskSuggestion", + cascade="all, delete-orphan" ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index 0bc956fd..dcf42ab6 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -1,3 +1,5 @@ +from sqlalchemy.orm import relationship + from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin from src.db.models.templates_.base import Base @@ -21,4 +23,6 @@ class AgencyIDSubtaskSuggestion( "confidence BETWEEN 0 and 100" ), nullable=False, - ) \ No newline at end of file + ) + + agency = relationship("Agency") \ No newline at end of file diff --git a/src/db/models/views/has_agency_auto_suggestion.py b/src/db/models/views/has_agency_auto_suggestion.py deleted file mode 100644 index c72b9fd3..00000000 --- a/src/db/models/views/has_agency_auto_suggestion.py +++ /dev/null @@ -1,31 +0,0 @@ -""" - CREATE OR REPLACE VIEW url_has_agency_auto_suggestions_view AS - SELECT - u.id as url_id, - (uas.id IS NOT NULL) AS has_agency_suggestions - FROM public.urls u - LEFT JOIN public.url_auto_agency_id_subtasks uas on u.id = uas.url_id -""" - - -from sqlalchemy import Column, Boolean, PrimaryKeyConstraint -from sqlalchemy.orm import Mapped - -from src.db.models.mixins import URLDependentMixin, ViewMixin -from src.db.models.templates_.base import Base - - -class HasAgencyAutoSuggestionView( - Base, - 
URLDependentMixin, - ViewMixin -): - - __tablename__ = "url_has_agency_auto_suggestions_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) - - has_agency_suggestions: Mapped[bool] = Column(Boolean, nullable=False) - diff --git a/tests/automated/integration/db/structure/test_view.py b/tests/automated/integration/db/structure/test_view.py deleted file mode 100644 index 08a5d57c..00000000 --- a/tests/automated/integration/db/structure/test_view.py +++ /dev/null @@ -1,70 +0,0 @@ -import pytest - -from src.collectors.enums import URLStatus -from src.core.enums import BatchStatus -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from src.db.models.exceptions import WriteToViewError -from src.db.models.impl.task.core import Task -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode -from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask -from src.db.models.views.has_agency_auto_suggestion import HasAgencyAutoSuggestionView - -@pytest.mark.asyncio -async def test_has_agency_auto_suggestion_view( - adb_client_test: AsyncDatabaseClient -) -> None: - """Test functionality of agency auto suggestion view and view logic in general.""" - - view_objects: list[HasAgencyAutoSuggestionView] = \ - await adb_client_test.get_all(HasAgencyAutoSuggestionView) - - assert len(view_objects) == 0 - - url = URL( - url="https://example.com/1", - status=URLStatus.OK, - source=URLSource.COLLECTOR - ) - url_id: int = await adb_client_test.add(url, return_id=True) - - view_objects: list[HasAgencyAutoSuggestionView] = \ - await adb_client_test.get_all(HasAgencyAutoSuggestionView) - - assert len(view_objects) == 1 - assert view_objects[0].url_id == url_id - assert view_objects[0].has_agency_suggestions is False - - - task = 
Task( - task_type=TaskType.HTML.value, - task_status=BatchStatus.READY_TO_LABEL, - ) - task_id: int = await adb_client_test.add(task, return_id=True) - - subtask = URLAutoAgencyIDSubtask( - task_id=task_id, - url_id=url_id, - subtask=AutoAgencyIDSubtaskType.CKAN, - agencies_found=False, - detail=SubtaskDetailCode.RETRIEVAL_ERROR - ) - await adb_client_test.add(subtask) - - view_objects: list[HasAgencyAutoSuggestionView] = \ - await adb_client_test.get_all(HasAgencyAutoSuggestionView) - - assert len(view_objects) == 1 - assert view_objects[0].url_id == url_id - assert view_objects[0].has_agency_suggestions is True - - - view_obj_to_add = HasAgencyAutoSuggestionView( - url_id=1, - has_agency_suggestions=True - ) - - with pytest.raises(WriteToViewError): - await adb_client_test.add(view_obj_to_add) \ No newline at end of file From 0dfb27256c3f70bb0763f31a86ebaf74a1412ede Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 4 Sep 2025 08:57:44 -0400 Subject: [PATCH 18/33] Continue Draft --- .../queries/agency_suggestion_/__init__.py | 0 .../core.py} | 0 .../suggestions_with_highest_confidence.py | 0 .../agency/get/queries/next_for_annotation.py | 2 +- src/api/endpoints/annotate/all/get/query.py | 2 +- .../review/next/{query.py => core.py} | 60 +++++++------------ src/api/endpoints/review/next/extract.py | 23 +++++++ .../endpoints/review/next/queries/__init__.py | 0 .../review/next/queries/count_reviewed.py | 18 ++++++ .../review/next/templates/__init__.py | 0 .../review/next/templates/count_cte.py | 15 +++++ src/db/client/async_.py | 2 +- src/db/models/impl/agency/sqlalchemy.py | 3 +- src/db/models/impl/url/core/sqlalchemy.py | 5 +- .../core/common/annotation_exists.py | 2 +- 15 files changed, 83 insertions(+), 49 deletions(-) create mode 100644 src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py rename src/api/endpoints/annotate/agency/get/queries/{agency_suggestion.py => agency_suggestion_/core.py} (100%) create mode 100644 
src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py rename src/api/endpoints/review/next/{query.py => core.py} (83%) create mode 100644 src/api/endpoints/review/next/extract.py create mode 100644 src/api/endpoints/review/next/queries/__init__.py create mode 100644 src/api/endpoints/review/next/queries/count_reviewed.py create mode 100644 src/api/endpoints/review/next/templates/__init__.py create mode 100644 src/api/endpoints/review/next/templates/count_cte.py diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py similarity index 100% rename from src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py rename to src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 5fd8cea9..e8fdc6b2 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -4,7 +4,7 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAnnotationInnerResponse -from 
src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index dbda0f8b..8a33b79f 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import selectinload from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder -from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/core.py similarity index 83% rename from src/api/endpoints/review/next/query.py rename to src/api/endpoints/review/next/core.py index 9e87737c..d9ac3d67 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/core.py @@ -7,6 +7,9 @@ from src.api.endpoints.review.next.convert import convert_agency_info_to_final_review_annotation_agency_info from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata, FinalReviewBatchInfo, \ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo +from src.api.endpoints.review.next.extract import extract_html_content_infos, 
extract_optional_metadata +from src.api.endpoints.review.next.queries.count_reviewed import COUNT_REVIEWED_CTE +from src.api.endpoints.review.next.templates.count_cte import CountCTE from src.collectors.enums import URLStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.constants import USER_ANNOTATION_MODELS @@ -18,6 +21,8 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase @@ -43,11 +48,16 @@ def __init__(self, batch_id: int | None = None): ] # The below relationships are joined to entities that are joined to the URL self.double_join_relationships = [ - # TODO: Replace with new logic - # (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), (URL.confirmed_agencies, LinkURLAgency.agency) ] + self.triple_join_relationships = [ + ( + URL.auto_agency_subtasks, + URLAutoAgencyIDSubtask.suggestions, + AgencyIDSubtaskSuggestion.agency + ) + ] self.count_label = "count" @@ -126,6 +136,10 @@ async def _apply_options( *[ joinedload(primary).joinedload(secondary) for primary, secondary in self.double_join_relationships + ], + *[ + joinedload(primary).joinedload(secondary).joinedload(tertiary) + for primary, secondary, tertiary in self.triple_join_relationships ] ) @@ -135,40 +149,23 @@ async def _apply_order_clause(self, url_query: Select): asc(URL.id) ) - async def _extract_html_content_infos(self, url: 
URL) -> list[URLHTMLContentInfo]: - html_content = url.html_content - html_content_infos = [ - URLHTMLContentInfo(**html_info.__dict__) - for html_info in html_content - ] - return html_content_infos - - async def _extract_optional_metadata(self, url: URL) -> FinalReviewOptionalMetadata: - if url.optional_data_source_metadata is None: - return FinalReviewOptionalMetadata() - return FinalReviewOptionalMetadata( - record_formats=url.optional_data_source_metadata.record_formats, - data_portal_type=url.optional_data_source_metadata.data_portal_type, - supplying_entity=url.optional_data_source_metadata.supplying_entity - ) - async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | None: if self.batch_id is None: return None - count_reviewed_query = await self.get_count_reviewed_query() + count_reviewed_query: CountCTE = COUNT_REVIEWED_CTE count_ready_query = await self.get_count_ready_query() full_query = ( select( - func.coalesce(count_reviewed_query.c[self.count_label], 0).label("count_reviewed"), + func.coalesce(count_reviewed_query.count, 0).label("count_reviewed"), func.coalesce(count_ready_query.c[self.count_label], 0).label("count_ready_for_review") ) .select_from( count_ready_query.outerjoin( count_reviewed_query, - count_reviewed_query.c.batch_id == count_ready_query.c.batch_id + count_reviewed_query.batch_id == count_ready_query.c.batch_id ) ) ) @@ -201,21 +198,6 @@ async def get_count_ready_query(self): ) return count_ready_query - async def get_count_reviewed_query(self): - count_reviewed_query = ( - select( - Batch.id.label("batch_id"), - func.count(FlagURLValidated.url_id).label(self.count_label) - ) - .select_from(Batch) - .join(LinkBatchURL) - .outerjoin(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id) - - .group_by(Batch.id) - .subquery("count_reviewed") - ) - return count_reviewed_query - async def run( self, session: AsyncSession @@ -243,8 +225,8 @@ async def run( result: URL = row[0] - html_content_infos = 
await self._extract_html_content_infos(result) - optional_metadata = await self._extract_optional_metadata(result) + html_content_infos: list[URLHTMLContentInfo] = await extract_html_content_infos(result) + optional_metadata: FinalReviewOptionalMetadata = await extract_optional_metadata(result) batch_info = await self.get_batch_info(session) try: diff --git a/src/api/endpoints/review/next/extract.py b/src/api/endpoints/review/next/extract.py new file mode 100644 index 00000000..aca642e0 --- /dev/null +++ b/src/api/endpoints/review/next/extract.py @@ -0,0 +1,23 @@ +from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.impl.url.core.sqlalchemy import URL + + +async def extract_html_content_infos( + url: URL +)-> list[URLHTMLContentInfo]: + html_content = url.html_content + html_content_infos = [ + URLHTMLContentInfo(**html_info.__dict__) + for html_info in html_content + ] + return html_content_infos + +async def extract_optional_metadata(url: URL) -> FinalReviewOptionalMetadata: + if url.optional_data_source_metadata is None: + return FinalReviewOptionalMetadata() + return FinalReviewOptionalMetadata( + record_formats=url.optional_data_source_metadata.record_formats, + data_portal_type=url.optional_data_source_metadata.data_portal_type, + supplying_entity=url.optional_data_source_metadata.supplying_entity + ) \ No newline at end of file diff --git a/src/api/endpoints/review/next/queries/__init__.py b/src/api/endpoints/review/next/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/review/next/queries/count_reviewed.py b/src/api/endpoints/review/next/queries/count_reviewed.py new file mode 100644 index 00000000..c9bf52bb --- /dev/null +++ b/src/api/endpoints/review/next/queries/count_reviewed.py @@ -0,0 +1,18 @@ +from sqlalchemy import select, func + +from src.api.endpoints.review.next.templates.count_cte import CountCTE 
+from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +COUNT_REVIEWED_CTE = CountCTE( + select( + Batch.id.label("batch_id"), + func.count(FlagURLValidated.url_id).label("count") + ) + .select_from(Batch) + .join(LinkBatchURL) + .outerjoin(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id) + .group_by(Batch.id) + .cte("count_reviewed") +) \ No newline at end of file diff --git a/src/api/endpoints/review/next/templates/__init__.py b/src/api/endpoints/review/next/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/review/next/templates/count_cte.py b/src/api/endpoints/review/next/templates/count_cte.py new file mode 100644 index 00000000..0abbbab4 --- /dev/null +++ b/src/api/endpoints/review/next/templates/count_cte.py @@ -0,0 +1,15 @@ +from sqlalchemy import CTE, Column + + +class CountCTE: + + def __init__(self, cte: CTE): + self.cte = cte + + @property + def batch_id(self) -> Column[int]: + return self.cte.c['batch_id'] + + @property + def count(self) -> Column[int]: + return self.cte.c['count'] \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 93ec996c..a028d404 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -42,7 +42,7 @@ from src.api.endpoints.review.approve.query_.core import ApproveURLQueryBuilder from src.api.endpoints.review.enums import RejectionReason from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse -from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder +from src.api.endpoints.review.next.core import GetNextURLForFinalReviewQueryBuilder from src.api.endpoints.review.reject.query import RejectURLQueryBuilder from src.api.endpoints.search.dtos.response import SearchURLResponse from 
src.api.endpoints.task.by_id.dto import TaskInfo diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index 9477ecef..032dc397 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -25,7 +25,6 @@ class Agency( locality = Column(String, nullable=True) # Relationships - # TODO: Revise - # automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") + automated_suggestions = relationship("AgencyIDSubtaskSuggestion") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") confirmed_urls = relationship("LinkURLAgency", back_populates="agency") diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 7411f934..82b337b0 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -50,10 +50,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): secondary="link_task_urls", back_populates="urls", ) - # TODO: Revise - # automated_agency_suggestions = relationship( - # "AutomatedUrlAgencySuggestion", back_populates="url") - auto_agency_suggestions = relationship( + auto_agency_subtasks = relationship( "URLAutoAgencyIDSubtask" ) user_agency_suggestion = relationship( diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists.py index c84f54f1..bf1c07a1 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists.py @@ -30,7 +30,7 @@ class AnnotationExistsCTEQueryBuilder(QueryBuilderBase): def url_id(self): return self.query.c.url_id - def get_exists_label(self, model: Type[URLDependentMixin]): + def get_exists_label(self, model: Type[URLDependentMixin]) -> str: return f"{model.__name__}_exists" def get_all(self) -> list[Any]: From db770beb608513b736d16d29fd997d3a16e06c0e 
Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 5 Sep 2025 06:57:52 -0400 Subject: [PATCH 19/33] Update Draft --- ...aee0dd79_overhaul_agency_identification.py | 26 ++- .../get/queries/agency_suggestion_/core.py | 91 ++++++----- .../suggestions_with_highest_confidence.py | 62 ++++++++ src/api/endpoints/annotate/all/get/query.py | 2 +- src/api/endpoints/review/next/convert.py | 35 +++-- src/api/endpoints/review/next/core.py | 8 +- .../review/next/queries/count_reviewed.py | 2 +- src/core/tasks/url/loader.py | 3 +- .../operators/agency_identification/core.py | 26 +-- .../agency_identification/dtos/tdo.py | 11 -- ...pending_urls_without_agency_suggestions.py | 38 ----- .../has_urls_without_agency_suggestions.py | 27 ---- .../agency_identification/subtasks/convert.py | 2 +- .../subtasks/impl/ckan_/core.py | 7 +- .../subtasks/impl/ckan_/query.py | 51 ++++++ .../subtasks/impl/muckrock_/core.py | 11 +- .../subtasks/impl/muckrock_/query.py | 55 +++++++ .../impl/nlp_location_match_/convert.py | 2 +- .../subtasks/impl/unknown.py | 30 ---- .../agency_identification/subtasks/loader.py | 50 +++--- .../subtasks/models/run_info.py | 1 + .../subtasks/planner/queries/core.py | 26 --- .../subtasks/planner/reconcile.py | 23 --- .../queries/survey}/__init__.py | 0 .../{planner => queries/survey}/constants.py | 7 +- .../{planner => queries/survey}/core.py | 14 +- .../survey/queries}/__init__.py | 0 .../subtasks/queries/survey/queries/core.py | 57 +++++++ .../survey}/queries/ctes/README.md | 0 .../survey/queries/ctes}/__init__.py | 0 .../queries/survey/queries/ctes/eligible.py | 30 ++++ .../survey/queries/ctes/exists}/__init__.py | 0 .../survey/queries/ctes/exists/container.py} | 17 +- .../queries/ctes/exists/impl/__init__.py} | 0 .../impl/high_confidence_annotations.py | 29 ++++ .../queries/ctes/exists/impl/validated.py | 16 ++ .../survey/queries/ctes/subtask/__init__.py} | 0 .../survey/queries/ctes/subtask/container.py | 40 +++++ .../survey/queries/ctes/subtask/helpers.py | 18 +++ 
.../queries/ctes/subtask/impl/__init__.py} | 0 .../survey/queries/ctes/subtask/impl/ckan.py | 37 +++++ .../queries/ctes/subtask/impl/homepage.py | 99 ++++++++++++ .../queries/ctes/subtask/impl/muckrock.py | 40 +++++ .../queries/ctes/subtask/impl/nlp_location.py | 26 +++ .../queries/survey/queries/eligible_counts.py | 22 +++ .../subtasks/templates/subtask.py | 8 +- src/db/client/async_.py | 13 +- src/db/constants.py | 12 -- .../url/suggestion/agency/subtask/pydantic.py | 5 +- .../suggestion/agency/subtask/sqlalchemy.py | 2 +- .../suggestion/agency/suggestion/pydantic.py | 7 + .../agency/suggestion/sqlalchemy.py | 6 +- src/db/models/views/meta_url.py | 26 +++ .../common/annotation_exists_/__init__.py} | 0 .../common/annotation_exists_/constants.py | 15 ++ .../core.py} | 2 +- .../core/metrics/urls/aggregated/pending.py | 2 +- .../summaries/test_pending_url_filter.py | 7 +- .../api/metrics/batches/test_aggregated.py | 8 +- .../api/metrics/batches/test_breakdown.py | 10 +- .../integration/api/metrics/test_backlog.py | 13 +- .../api/metrics/urls/aggregated/test_core.py | 20 ++- .../api/review/test_batch_filtering.py | 4 +- .../core/async_/run_task/test_break_loop.py | 4 +- .../core/async_/run_task/test_prereq_met.py | 4 +- .../{happy_path => }/conftest.py | 11 +- .../happy_path/test_happy_path.py | 128 --------------- .../subtasks/ckan/__init__.py | 0 .../subtasks/ckan/test_core.py | 100 ++++++++++++ .../subtasks/homepage_match/__init__.py | 0 .../subtasks/homepage_match/test_core.py | 6 + .../subtasks/muckrock/__init__.py | 0 .../subtasks/muckrock/test_core.py | 148 ++++++++++++++++++ .../subtasks/nlp_location_match/__init__.py | 0 .../subtasks/nlp_location_match/test_core.py | 6 + .../subtasks/test_ckan.py | 58 ------- .../subtasks/test_muckrock.py | 80 ---------- .../subtasks/test_unknown.py | 16 -- .../data_creator/commands/impl/annotate.py | 2 +- .../impl/suggestion/auto/agency_/__init__.py | 0 .../auto/{agency.py => agency_/core.py} | 13 +- 
tests/helpers/data_creator/core.py | 4 +- tests/helpers/data_creator/create.py | 2 + tests/helpers/data_creator/generate.py | 2 + 84 files changed, 1169 insertions(+), 616 deletions(-) delete mode 100644 src/core/tasks/url/operators/agency_identification/dtos/tdo.py delete mode 100644 src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py delete mode 100644 src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py rename src/core/tasks/url/operators/agency_identification/{queries => subtasks/queries/survey}/__init__.py (100%) rename src/core/tasks/url/operators/agency_identification/subtasks/{planner => queries/survey}/constants.py (72%) rename src/core/tasks/url/operators/agency_identification/subtasks/{planner => queries/survey}/core.py (53%) rename src/core/tasks/url/operators/agency_identification/subtasks/{planner => queries/survey/queries}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py rename src/core/tasks/url/operators/agency_identification/subtasks/{planner => queries/survey}/queries/ctes/README.md (100%) rename src/core/tasks/url/operators/agency_identification/subtasks/{planner/queries => queries/survey/queries/ctes}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py rename src/core/tasks/url/operators/agency_identification/subtasks/{planner/queries/ctes => queries/survey/queries/ctes/exists}/__init__.py (100%) rename src/core/tasks/url/operators/agency_identification/subtasks/{planner/queries/ctes/base.py 
=> queries/survey/queries/ctes/exists/container.py} (52%) rename src/core/tasks/url/operators/agency_identification/subtasks/{planner/queries/ctes/ckan.py => queries/survey/queries/ctes/exists/impl/__init__.py} (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py rename src/core/tasks/url/operators/agency_identification/subtasks/{planner/queries/ctes/homepage.py => queries/survey/queries/ctes/subtask/__init__.py} (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py rename src/core/tasks/url/operators/agency_identification/subtasks/{planner/queries/ctes/muckrock.py => queries/survey/queries/ctes/subtask/impl/__init__.py} (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py create mode 100644 src/db/models/views/meta_url.py rename src/{core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/nlp_location.py => db/queries/implementations/core/common/annotation_exists_/__init__.py} (100%) create mode 100644 
src/db/queries/implementations/core/common/annotation_exists_/constants.py rename src/db/queries/implementations/core/common/{annotation_exists.py => annotation_exists_/core.py} (96%) rename tests/automated/integration/tasks/url/impl/agency_identification/{happy_path => }/conftest.py (79%) delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py rename src/core/tasks/url/operators/agency_identification/subtasks/queries/insert.py => tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/__init__.py (100%) create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py create mode 100644 tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py rename tests/helpers/data_creator/commands/impl/suggestion/auto/{agency.py => agency_/core.py} (84%) diff --git 
a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py index a58c5e56..702774d5 100644 --- a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -25,6 +25,8 @@ URL_AUTO_AGENCY_SUBTASK_TABLE_NAME: str = "url_auto_agency_id_subtasks" LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "agency_id_subtask_suggestions" +META_URL_VIEW_NAME: str = "meta_url_view" + URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME: str = "url_auto_agency_suggestions" AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.dialects.postgresql.ENUM( @@ -42,23 +44,44 @@ ) + + + def upgrade() -> None: _create_url_auto_agency_subtask_table() _create_url_unknown_agencies_view() + _create_meta_url_view() _create_link_agency_id_subtask_agencies_table() _create_new_url_annotation_flags_view() _drop_url_auto_agency_suggestions_table() + + def downgrade() -> None: _drop_url_unknown_agencies_view() _create_url_auto_agency_suggestions_table() _create_old_url_annotation_flags_view() _drop_link_agency_id_subtask_agencies_table() _drop_url_auto_agency_subtask_table() + _drop_meta_url_view() SUBTASK_DETAIL_CODE_ENUM.drop(op.get_bind()) +def _drop_meta_url_view(): + op.execute(f"DROP VIEW IF EXISTS {META_URL_VIEW_NAME}") + + +def _create_meta_url_view(): + op.execute(f""" + CREATE OR REPLACE VIEW {META_URL_VIEW_NAME} AS + SELECT + urls.id as url_id + FROM urls + INNER JOIN flag_url_validated fuv on fuv.url_id = urls.id + where fuv.type = 'meta url' + """) + def _drop_url_auto_agency_suggestions_table(): op.drop_table(URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME) @@ -105,7 +128,7 @@ def _create_url_auto_agency_subtask_table(): task_id_column(), url_id_column(), sa.Column( - "subtask", + "type", AGENCY_AUTO_SUGGESTION_METHOD_ENUM, nullable=False ), @@ -127,6 +150,7 @@ def _create_url_auto_agency_subtask_table(): def 
_create_link_agency_id_subtask_agencies_table(): op.create_table( LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME, + id_column(), sa.Column( "subtask_id", sa.Integer(), diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py index 52c58c40..74740591 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py @@ -1,8 +1,16 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.suggestions_with_highest_confidence import \ + SuggestionsWithHighestConfidenceCTE +from src.core.enums import SuggestionType +from src.db.models.impl.agency.sqlalchemy import Agency from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh class GetAgencySuggestionsQueryBuilder(QueryBuilderBase): @@ -15,38 +23,51 @@ def __init__( async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo]: # Get relevant autosuggestions and agency info, if an associated agency exists - raise NotImplementedError("Revise") - - # statement = ( - # select( - # AutomatedUrlAgencySuggestion.agency_id, - # AutomatedUrlAgencySuggestion.is_unknown, - # Agency.name, - # Agency.state, - # Agency.county, - # Agency.locality - # ) - # .join(Agency, isouter=True) - # .where(AutomatedUrlAgencySuggestion.url_id == self.url_id) - # ) - # raw_autosuggestions = await session.execute(statement) - # autosuggestions = raw_autosuggestions.all() - # agency_suggestions = [] - # for autosuggestion in autosuggestions: - # agency_id = autosuggestion[0] - # is_unknown = autosuggestion[1] - # name = autosuggestion[2] - # state = 
autosuggestion[3] - # county = autosuggestion[4] - # locality = autosuggestion[5] - # agency_suggestions.append( - # GetNextURLForAgencyAgencyInfo( - # suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, - # pdap_agency_id=agency_id, - # agency_name=name, - # state=state, - # county=county, - # locality=locality - # ) - # ) - # return agency_suggestions \ No newline at end of file + + cte = SuggestionsWithHighestConfidenceCTE() + + query = ( + select( + cte.agency_id, + cte.confidence, + Agency.name, + Agency.state, + Agency.county, + Agency.locality + ) + .outerjoin( + Agency, + Agency.id == cte.agency_id + ) + .where( + cte.url_id == self.url_id + ) + ) + + raw_autosuggestions: Sequence[RowMapping] = await sh.mappings(session, query=query) + if len(raw_autosuggestions) == 0: + # Unknown agency + return [ + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.UNKNOWN, + ) + ] + + agency_suggestions: list[GetNextURLForAgencyAgencyInfo] = [] + for autosuggestion in raw_autosuggestions: + agency_id: int = autosuggestion["agency_id"] + name: str = autosuggestion["name"] + state: str | None = autosuggestion["state"] + county: str | None = autosuggestion["county"] + locality: str | None = autosuggestion["locality"] + agency_suggestions.append( + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=agency_id, + agency_name=name, + state=state, + county=county, + locality=locality + ) + ) + return agency_suggestions \ No newline at end of file diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py index e69de29b..6d389b11 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py +++ 
b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py @@ -0,0 +1,62 @@ +from sqlalchemy import CTE, select, func, Column + +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion + +SUGGESTIONS_WITH_HIGHEST_CONFIDENCE_CTE: CTE = ( + select( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id, + func.max(AgencyIDSubtaskSuggestion.confidence) + ) + .select_from(URLAutoAgencyIDSubtask) + .join( + AgencyIDSubtaskSuggestion, + URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id + ) + .group_by( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id + ) + .cte("suggestions_with_highest_confidence") +) + +class SuggestionsWithHighestConfidenceCTE: + + def __init__(self): + self._cte = ( + select( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id, + func.max(AgencyIDSubtaskSuggestion.confidence).label("confidence") + ) + .select_from(URLAutoAgencyIDSubtask) + .join( + AgencyIDSubtaskSuggestion, + URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id + ) + .where( + AgencyIDSubtaskSuggestion.agency_id.isnot(None) + ) + .group_by( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id + ) + .cte("suggestions_with_highest_confidence") + ) + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def agency_id(self) -> Column[int]: + return self._cte.columns.agency_id + + @property + def confidence(self) -> Column[float]: + return self._cte.columns.confidence \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index 8a33b79f..05855578 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ 
b/src/api/endpoints/annotate/all/get/query.py @@ -50,7 +50,7 @@ async def run( load_options = [ URL.html_content, - URL.automated_agency_suggestions, + URL.auto_agency_subtasks, URL.auto_relevant_suggestion, URL.auto_record_type_suggestion ] diff --git a/src/api/endpoints/review/next/convert.py b/src/api/endpoints/review/next/convert.py index ba443a8f..962b7e1e 100644 --- a/src/api/endpoints/review/next/convert.py +++ b/src/api/endpoints/review/next/convert.py @@ -1,5 +1,5 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo -from src.api.endpoints.review.next.dto import FinalReviewAnnotationAgencyInfo +from src.api.endpoints.review.next.dto import FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyAutoInfo from src.core.enums import SuggestionType from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency @@ -8,23 +8,27 @@ def convert_agency_info_to_final_review_annotation_agency_info( - automated_agency_suggestions: list[None], + subtasks: list[URLAutoAgencyIDSubtask], confirmed_agencies: list[LinkURLAgency], user_agency_suggestion: UserUrlAgencySuggestion ) -> FinalReviewAnnotationAgencyInfo: - confirmed_agency_info = _convert_confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies + confirmed_agency_info: list[GetNextURLForAgencyAgencyInfo] = ( + _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies + ) ) - # TODO: Revise - # agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( - # automated_agency_suggestions - # ) - agency_auto_info = None + agency_auto_info: FinalReviewAnnotationAgencyAutoInfo = ( + _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( + subtasks + ) + ) - agency_user_info = _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_agency_suggestion + agency_user_info: 
GetNextURLForAgencyAgencyInfo | None = ( + _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestion + ) ) return FinalReviewAnnotationAgencyInfo( @@ -76,10 +80,12 @@ def _convert_agency_to_get_next_url_for_agency_agency_info( def _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( subtasks: list[URLAutoAgencyIDSubtask] -) -> list[GetNextURLForAgencyAgencyInfo]: +) -> FinalReviewAnnotationAgencyAutoInfo: results: list[GetNextURLForAgencyAgencyInfo] = [] + count_agencies_not_found: int = 0 for subtask in subtasks: if not subtask.agencies_found: + count_agencies_not_found += 1 continue for suggestion in subtask.suggestions: info: GetNextURLForAgencyAgencyInfo = _convert_agency_to_get_next_url_for_agency_agency_info( @@ -87,4 +93,7 @@ def _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_ agency=suggestion.agency ) results.append(info) - return results \ No newline at end of file + return FinalReviewAnnotationAgencyAutoInfo( + unknown=count_agencies_not_found == len(subtasks), + suggestions=results + ) diff --git a/src/api/endpoints/review/next/core.py b/src/api/endpoints/review/next/core.py index d9ac3d67..6fb6c95d 100644 --- a/src/api/endpoints/review/next/core.py +++ b/src/api/endpoints/review/next/core.py @@ -16,8 +16,6 @@ from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL @@ -26,7 +24,7 @@ from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.mixins import URLDependentMixin from 
src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder +from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL = "total_distinct_annotation_count" @@ -164,7 +162,7 @@ async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | ) .select_from( count_ready_query.outerjoin( - count_reviewed_query, + count_reviewed_query.cte, count_reviewed_query.batch_id == count_ready_query.c.batch_id ) ) @@ -247,7 +245,7 @@ async def run( auto_suggestion=result.auto_record_type_suggestion ), agency=convert_agency_info_to_final_review_annotation_agency_info( - automated_agency_suggestions=result.automated_agency_suggestions, + subtasks=result.auto_agency_subtasks, user_agency_suggestion=result.user_agency_suggestion, confirmed_agencies=result.confirmed_agencies ) diff --git a/src/api/endpoints/review/next/queries/count_reviewed.py b/src/api/endpoints/review/next/queries/count_reviewed.py index c9bf52bb..91349cb5 100644 --- a/src/api/endpoints/review/next/queries/count_reviewed.py +++ b/src/api/endpoints/review/next/queries/count_reviewed.py @@ -5,7 +5,7 @@ from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -COUNT_REVIEWED_CTE = CountCTE( +COUNT_REVIEWED_CTE: CountCTE = CountCTE( select( Batch.id.label("batch_id"), func.count(FlagURLValidated.url_id).label("count") diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 45f750af..8b5a18c1 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -79,7 +79,8 @@ async def _get_agency_identification_task_operator(self) -> URLTaskEntry: adb_client=self.adb_client, loader=AgencyIdentificationSubtaskLoader( pdap_client=self.pdap_client, - muckrock_api_interface=self.muckrock_api_interface 
+ muckrock_api_interface=self.muckrock_api_interface, + adb_client=self.adb_client ) ) return URLTaskEntry( diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index 9c2e00f4..f5a84061 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -1,7 +1,9 @@ +from src.core.tasks.mixins.link_urls import LinkURLsMixin from src.core.tasks.url.operators.agency_identification.exceptions import SubtaskError from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo -from src.core.tasks.url.operators.agency_identification.subtasks.planner.core import AgencyIDSubtaskPlanner +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.core import \ + AgencyIDSubtaskSurveyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient @@ -9,18 +11,19 @@ from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -class AgencyIdentificationTaskOperator(URLTaskOperatorBase): +class AgencyIdentificationTaskOperator( + URLTaskOperatorBase, + LinkURLsMixin +): def __init__( self, adb_client: AsyncDatabaseClient, loader: AgencyIdentificationSubtaskLoader, - planner: AgencyIDSubtaskPlanner, ): super().__init__(adb_client) self.loader = loader self._subtask: AutoAgencyIDSubtaskType | None = None - self.planner = planner @property def task_type(self) -> TaskType: @@ -31,10 +34,13 @@ async def meets_task_prerequisites(self) -> bool: Modifies: - self._subtask """ - subtask_type: AutoAgencyIDSubtaskType | None = await 
self.planner.plan_next_subtask() - if subtask_type is None: + next_subtask: AutoAgencyIDSubtaskType | None = \ + await self.adb_client.run_query_builder( + AgencyIDSubtaskSurveyQueryBuilder() + ) + self._subtask = next_subtask + if next_subtask is None: return False - self._subtask = subtask_type return True @@ -43,10 +49,7 @@ async def load_subtask( subtask_type: AutoAgencyIDSubtaskType ) -> AgencyIDSubtaskOperatorBase: """Get subtask based on collector type.""" - return await self.loader.load_subtask(subtask_type) - - async def plan_next_subtask(self) -> AutoAgencyIDSubtaskType | None: - return await self.planner.plan_next_subtask() + return await self.loader.load_subtask(subtask_type, task_id=self.task_id) @staticmethod async def run_subtask( @@ -57,6 +60,7 @@ async def run_subtask( async def inner_task_logic(self) -> None: subtask_operator: AgencyIDSubtaskOperatorBase = await self.load_subtask(self._subtask) run_info: AgencyIDSubtaskRunInfo = await self.run_subtask(subtask_operator) + await self.link_urls_to_task(run_info.linked_url_ids) if not run_info.is_success: raise SubtaskError(run_info.error) diff --git a/src/core/tasks/url/operators/agency_identification/dtos/tdo.py b/src/core/tasks/url/operators/agency_identification/dtos/tdo.py deleted file mode 100644 index 72f24d97..00000000 --- a/src/core/tasks/url/operators/agency_identification/dtos/tdo.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from src.collectors.enums import CollectorType - - -class AgencyIdentificationTDO(BaseModel): - url_id: int - collector_metadata: dict | None = None - collector_type: CollectorType | None diff --git a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py deleted file mode 100644 index b3280cf2..00000000 --- 
a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ /dev/null @@ -1,38 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus, CollectorType -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class GetPendingURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> list[AgencyIdentificationTDO]: - - statement = ( - select( - URL.id, - URL.collector_metadata, - Batch.strategy - ) - .select_from(URL) - .where(URL.status == URLStatus.OK.value) - .outerjoin(LinkBatchURL) - .outerjoin(Batch) - ) - statement = StatementComposer.exclude_urls_with_agency_suggestions(statement) - statement = statement.limit(100) - raw_results = await session.execute(statement) - return [ - AgencyIdentificationTDO( - url_id=raw_result[0], - collector_metadata=raw_result[1], - collector_type=CollectorType(raw_result[2]) if raw_result[2] is not None else None - ) - for raw_result in raw_results - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py deleted file mode 100644 index 9877675b..00000000 --- a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py +++ /dev/null @@ -1,27 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus -from 
src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class HasURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase): - - async def run( - self, - session: AsyncSession - ) -> bool: - statement = ( - select( - URL.id - ).where( - URL.status == URLStatus.OK.value - ) - ) - - statement = StatementComposer.exclude_urls_with_agency_suggestions(statement) - raw_result = await session.execute(statement) - result = raw_result.all() - return len(result) != 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py index 976e6e4a..95c9e704 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -19,7 +19,7 @@ def convert_match_agency_response_to_subtask_data( agencies_found: bool = len(suggestions) > 0 subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( url_id=url_id, - subtask=subtask_type, + type=subtask_type, agencies_found=agencies_found, task_id=task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py index 925411f1..d1af5391 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -5,6 +5,8 @@ from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ convert_match_agency_response_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \ + GetCKANAgencyIDSubtaskParamsQueryBuilder from 
src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ AgencyIDSubtaskOperatorBase @@ -29,6 +31,7 @@ def __init__( @override async def inner_logic(self) -> None: params: list[CKANAgencyIDSubtaskParams] = await self._get_params() + self.linked_urls = [param.url_id for param in params] subtask_data_list: list[AutoAgencyIDSubtaskData] = [] for param in params: agency_name: str = param.collector_metadata["agency_name"] @@ -46,4 +49,6 @@ async def inner_logic(self) -> None: await self._upload_subtask_data(subtask_data_list) async def _get_params(self) -> list[CKANAgencyIDSubtaskParams]: - raise NotImplementedError \ No newline at end of file + return await self.adb_client.run_query_builder( + GetCKANAgencyIDSubtaskParamsQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py index e69de29b..86160a10 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py @@ -0,0 +1,51 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetCKANAgencyIDSubtaskParamsQueryBuilder(QueryBuilderBase): + + async def run( + self, + 
session: AsyncSession + ) -> list[CKANAgencyIDSubtaskParams]: + query = ( + select( + URL.id, + URL.collector_metadata + ) + .join( + LinkBatchURL, + LinkBatchURL.url_id == URL.id, + ) + .join( + Batch, + Batch.id == LinkBatchURL.batch_id, + ) + .where( + Batch.strategy.in_( + ( + CollectorType.CKAN.value, + ) + ), + ) + .limit(500) + ) + + results: Sequence[RowMapping] = await sh.mappings(session, query=query) + return [ + CKANAgencyIDSubtaskParams( + url_id=mapping["id"], + collector_metadata=mapping["collector_metadata"], + ) + for mapping in results + ] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py index 28ee8f29..4fa92c2e 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -9,6 +9,8 @@ convert_match_agency_response_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \ + GetMuckrockAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient @@ -35,6 +37,7 @@ def __init__( @override async def inner_logic(self) -> None: params: list[MuckrockAgencyIDSubtaskParams] = await self._get_params() + self.linked_urls = [param.url_id for param in params] subtask_data_list: list[AutoAgencyIDSubtaskData] = [] for param in params: muckrock_agency_id: int = param.collector_metadata["agency"] @@ -55,7 +58,7 @@ async def inner_logic(self) -> None: subtask_data: 
AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( url_id=param.url_id, response=match_agency_response, - subtask_type=AutoAgencyIDSubtaskType.CKAN, + subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, task_id=self.task_id ) subtask_data_list.append(subtask_data) @@ -72,7 +75,7 @@ async def _error_subtask_data( pydantic_model = URLAutoAgencyIDSubtaskPydantic( task_id=self.task_id, url_id=url_id, - subtask=AutoAgencyIDSubtaskType.MUCKROCK, + type=AutoAgencyIDSubtaskType.MUCKROCK, agencies_found=False, detail=SubtaskDetailCode.RETRIEVAL_ERROR ) @@ -85,4 +88,6 @@ async def _error_subtask_data( ) async def _get_params(self) -> list[MuckrockAgencyIDSubtaskParams]: - raise NotImplementedError \ No newline at end of file + return await self.adb_client.run_query_builder( + GetMuckrockAgencyIDSubtaskParamsQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py index e69de29b..5c292f37 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py @@ -0,0 +1,55 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ + MuckrockAgencyIDSubtaskParams +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetMuckrockAgencyIDSubtaskParamsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: 
AsyncSession + ) -> list[MuckrockAgencyIDSubtaskParams]: + query = ( + select( + URL.id, + URL.collector_metadata + ) + .join( + LinkBatchURL, + LinkBatchURL.url_id == URL.id, + ) + .join( + Batch, + Batch.id == LinkBatchURL.batch_id, + ) + .where( + Batch.strategy.in_( + ( + CollectorType.MUCKROCK_ALL_SEARCH.value, + CollectorType.MUCKROCK_COUNTY_SEARCH.value, + CollectorType.MUCKROCK_SIMPLE_SEARCH.value, + ) + ), + ) + .limit(500) + ) + + results: Sequence[RowMapping] = await sh.mappings(session, query=query) + return [ + MuckrockAgencyIDSubtaskParams( + url_id=mapping["id"], + collector_metadata=mapping["collector_metadata"], + ) + for mapping in results + ] + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py index d2f14477..64f299fe 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py @@ -53,7 +53,7 @@ def convert_search_agency_response_to_subtask_data( pydantic_model = URLAutoAgencyIDSubtaskPydantic( task_id=task_id, url_id=url_id, - subtask=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, agencies_found=len(suggestions) > 0 ) return AutoAgencyIDSubtaskData( diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py deleted file mode 100644 index cd741c5b..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing_extensions import override, final - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from 
src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase - -@final -class UnknownAgencyIdentificationSubtask(AgencyIDSubtaskOperatorBase): - """A subtask that returns an unknown suggestion. - - Used in cases where the agency cannot be reliably inferred from the source. - """ - - @override - async def inner_logic( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.UNKNOWN, - pdap_agency_id=None, - agency_name=None, - state=None, - county=None, - locality=None - ) - ] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 493a94d2..31c6fbec 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,6 +1,5 @@ import spacy -from src.collectors.enums import CollectorType from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.core import \ @@ -11,7 +10,6 @@ NLPLocationMatchSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.core import \ NLPProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType @@ -27,55 +25,55 @@ def __init__( muckrock_api_interface: 
MuckrockAPIInterface, adb_client: AsyncDatabaseClient ): - self.pdap_client = pdap_client - self.muckrock_api_interface = muckrock_api_interface + self._pdap_client = pdap_client + self._muckrock_api_interface = muckrock_api_interface self.adb_client = adb_client - async def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: + def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: return MuckrockAgencyIDSubtaskOperator( task_id=task_id, adb_client=self.adb_client, - muckrock_api_interface=self.muckrock_api_interface, - pdap_client=self.pdap_client + muckrock_api_interface=self._muckrock_api_interface, + pdap_client=self._pdap_client ) - async def _load_ckan_subtask(self, task_id: int) -> CKANAgencyIDSubtaskOperator: + def _load_ckan_subtask(self, task_id: int) -> CKANAgencyIDSubtaskOperator: return CKANAgencyIDSubtaskOperator( task_id=task_id, adb_client=self.adb_client, - pdap_client=self.pdap_client + pdap_client=self._pdap_client ) - async def _load_homepage_match_subtask(self, task_id: int) -> HomepageMatchSubtaskOperator: + def _load_homepage_match_subtask(self, task_id: int) -> HomepageMatchSubtaskOperator: return HomepageMatchSubtaskOperator( task_id=task_id, adb_client=self.adb_client, ) - async def _load_nlp_location_match_subtask(self, task_id: int) -> NLPLocationMatchSubtaskOperator: + def _load_nlp_location_match_subtask(self, task_id: int) -> NLPLocationMatchSubtaskOperator: return NLPLocationMatchSubtaskOperator( task_id=task_id, adb_client=self.adb_client, - pdap_client=self.pdap_client, + pdap_client=self._pdap_client, processor=NLPProcessor( spacy.load('en_core_web_trf', disable=['parser']) ) ) - async def load_subtask(self, subtask_type: AutoAgencyIDSubtaskType) -> AgencyIDSubtaskOperatorBase: + async def load_subtask( + self, + subtask_type: AutoAgencyIDSubtaskType, + task_id: int + ) -> AgencyIDSubtaskOperatorBase: """Get subtask based on collector type.""" match subtask_type: - case 
CollectorType.MUCKROCK_SIMPLE_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.MUCKROCK_COUNTY_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.MUCKROCK_ALL_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.AUTO_GOOGLER: - return UnknownAgencyIdentificationSubtask() - case CollectorType.COMMON_CRAWLER: - return UnknownAgencyIdentificationSubtask() - case CollectorType.CKAN: - return await self._load_ckan_subtask() - return UnknownAgencyIdentificationSubtask() \ No newline at end of file + case AutoAgencyIDSubtaskType.MUCKROCK: + return self._load_muckrock_subtask(task_id) + case AutoAgencyIDSubtaskType.CKAN: + return self._load_ckan_subtask(task_id) + case AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: + return self._load_nlp_location_match_subtask(task_id) + case AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: + return self._load_homepage_match_subtask(task_id) + raise ValueError(f"Unknown subtask type: {subtask_type}") diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py index 59db69e6..b2ee3e28 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py @@ -3,6 +3,7 @@ class AgencyIDSubtaskRunInfo(BaseModel): error: str | None = None + linked_url_ids: list[int] | None = None @property def is_success(self) -> bool: diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py deleted file mode 100644 index 7765612d..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/core.py +++ /dev/null @@ -1,26 +0,0 @@ -from 
src.core.tasks.url.operators.agency_identification.subtasks.planner.constants import SUBTASK_HIERARCHY -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -from src.db.queries.base.builder import QueryBuilderBase - - -class AgencyIDSubtaskSurveyQueryBuilder(QueryBuilderBase): - """ - Survey applicable URLs to determine next subtask to run - - URLs are "inapplicable" if they have any of the following properties: - - Are validated via FlagURLValidated model - - Have at least one annotation with agency suggestion with confidence >= 95 - - Have all possible subtasks completed - - Returns a list of one or more subtasks to run - based on which subtask(s) have the most applicable URLs - (or an empty list if no subtasks have applicable URLs) - """ - - async def run(self, session: AsyncSession) -> list[AutoAgencyIDSubtaskType]: - raise NotImplementedError - - - diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py b/src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py deleted file mode 100644 index f0575f0d..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/planner/reconcile.py +++ /dev/null @@ -1,23 +0,0 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.planner.constants import SUBTASK_HIERARCHY -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType - -# TODO: Add test to confirm expected behavior -async def reconcile_tiebreakers( - subtasks: list[AutoAgencyIDSubtaskType] -) -> AutoAgencyIDSubtaskType: - """In the case of multiple subtasks being applicable, - determine which one to run based on priority.""" - - # TODO: Figure out why type hints are mismatched with this - rank: dict[AutoAgencyIDSubtaskType, int] = { - subtask: rank - for rank, subtask in enumerate(SUBTASK_HIERARCHY) - } - - def key(subtask: AutoAgencyIDSubtaskType) -> tuple[int, str]: - r = rank.get(subtask, None) - if r is 
None: - raise ValueError(f"Subtask {subtask} not found in hierarchy") - return r, subtask.value - - return min(subtasks, key=key) diff --git a/src/core/tasks/url/operators/agency_identification/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/queries/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py similarity index 72% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py index c7cf111e..749332e6 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/planner/constants.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py @@ -6,4 +6,9 @@ AutoAgencyIDSubtaskType.MUCKROCK, AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH -] \ No newline at end of file +] + +SUBTASK_HIERARCHY_MAPPING: dict[AutoAgencyIDSubtaskType, int] = { + subtask: idx + for idx, subtask in enumerate(SUBTASK_HIERARCHY) +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py similarity index 53% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py index 4968cf4e..57f30fc3 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/planner/core.py +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py @@ -1,6 +1,5 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.planner.queries.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.core import \ AgencyIDSubtaskSurveyQueryBuilder -from src.core.tasks.url.operators.agency_identification.subtasks.planner.reconcile import reconcile_tiebreakers from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType @@ -13,18 +12,11 @@ def __init__( ) -> None: self.adb_client = adb_client - # TODO: Add test to confirm properly returns one, multiple, or None async def plan_next_subtask(self) -> AutoAgencyIDSubtaskType | None: - applicable_subtasks: list[AutoAgencyIDSubtaskType] = \ + next_subtask: AutoAgencyIDSubtaskType | None = \ await self.adb_client.run_query_builder( AgencyIDSubtaskSurveyQueryBuilder() ) - - # Reconcile tiebreakers - if len(applicable_subtasks) == 0: - return None - if len(applicable_subtasks) > 1: - return await reconcile_tiebreakers(applicable_subtasks) - return applicable_subtasks[0] + return next_subtask diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py new file mode 100644 index 00000000..bcee8ccb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py @@ -0,0 +1,57 @@ 
+from collections import Counter + +from sqlalchemy import RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.constants import SUBTASK_HIERARCHY, \ + SUBTASK_HIERARCHY_MAPPING +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.eligible_counts import \ + ELIGIBLE_COUNTS_QUERY +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class AgencyIDSubtaskSurveyQueryBuilder(QueryBuilderBase): + """ + Survey applicable URLs to determine next subtask to run + + URLs are "inapplicable" if they have any of the following properties: + - Are validated via FlagURLValidated model + - Have at least one annotation with agency suggestion with confidence >= 95 + - Have all possible subtasks completed + + Returns a list of one or more subtasks to run + based on which subtask(s) have the most applicable URLs + (or an empty list if no subtasks have applicable URLs) + """ + + async def run(self, session: AsyncSession) -> AutoAgencyIDSubtaskType | None: + results: RowMapping = await sh.mapping(session, ELIGIBLE_COUNTS_QUERY) + counts: Counter[str] = Counter(results) + max_count: int = max(counts.values()) + if max_count == 0: + return None + subtasks_with_max_count: list[str] = [ + subtask for subtask, count in counts.items() + if count == max_count + ] + subtasks_as_enum_list: list[AutoAgencyIDSubtaskType] = [ + AutoAgencyIDSubtaskType(subtask) + for subtask in subtasks_with_max_count + ] + # Sort subtasks by priority (lower hierarchy index = higher priority) + sorted_subtasks: list[AutoAgencyIDSubtaskType] = sorted( + subtasks_as_enum_list, + key=lambda subtask: SUBTASK_HIERARCHY_MAPPING[subtask], + reverse=False, + ) + # Return the highest priority subtask + return sorted_subtasks[0] + + + + + + + diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/README.md b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/README.md similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/README.md rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/README.md diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py new file mode 100644 index 00000000..9b0c835e --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py @@ -0,0 +1,30 @@ +from sqlalchemy import select + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.high_confidence_annotations import \ + HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.validated import \ + VALIDATED_EXISTS_CONTAINER +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.ckan import \ + CKAN_SUBTASK_CONTAINER +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.homepage import \ + HOMEPAGE_SUBTASK_CONTAINER +from 
src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.muckrock import \ + MUCKROCK_SUBTASK_CONTAINER +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.nlp_location import \ + NLP_LOCATION_CONTAINER +from src.db.models.impl.url.core.sqlalchemy import URL + +ELIGIBLE_CTE = ( + select( + URL.id, + CKAN_SUBTASK_CONTAINER.eligible_query.label("ckan"), + MUCKROCK_SUBTASK_CONTAINER.eligible_query.label("muckrock"), + HOMEPAGE_SUBTASK_CONTAINER.eligible_query.label("homepage"), + NLP_LOCATION_CONTAINER.eligible_query.label("nlp_location"), + ) + .where( + HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER.not_exists_query, + VALIDATED_EXISTS_CONTAINER.not_exists_query, + ) + .cte("eligible") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py similarity index 52% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py index 85820123..d59c508c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/base.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py @@ -1,7 +1,9 @@ -from 
sqlalchemy import CTE, Column +from sqlalchemy import CTE, Column, ColumnElement, exists +from src.db.models.impl.url.core.sqlalchemy import URL -class PrereqCTE: + +class ExistsCTEContainer: """ Base class for CTEs that determine validity for each subtask. @@ -11,7 +13,7 @@ class PrereqCTE: def __init__( self, - cte: CTE + cte: CTE, ) -> None: self._cte = cte @@ -21,4 +23,11 @@ def cte(self) -> CTE: @property def url_id(self) -> Column[int]: - return self.cte.columns[0] \ No newline at end of file + return self.cte.columns[0] + + @property + def not_exists_query(self) -> ColumnElement[bool]: + return ( + ~exists() + .where(self.url_id == URL.id) + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/ckan.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py new file mode 100644 index 00000000..3ac0ced7 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py @@ -0,0 +1,29 @@ +from sqlalchemy import select + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \ + ExistsCTEContainer +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from 
src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion + +cte = ( + select( + URL.id + ) + .join( + URLAutoAgencyIDSubtask, + URLAutoAgencyIDSubtask.url_id == URL.id, + ) + .join( + AgencyIDSubtaskSuggestion, + AgencyIDSubtaskSuggestion.subtask_id == URLAutoAgencyIDSubtask.id, + ) + .where( + AgencyIDSubtaskSuggestion.confidence >= 95, + ) + .cte("high_confidence_annotations_exists") +) + +HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER = ExistsCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py new file mode 100644 index 00000000..f515c1d1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py @@ -0,0 +1,16 @@ +from sqlalchemy import select + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \ + ExistsCTEContainer +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated + +cte = ( + select( + FlagURLValidated.url_id + ) + .cte("validated_exists") +) + +VALIDATED_EXISTS_CONTAINER = ExistsCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/homepage.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py new file mode 100644 index 00000000..9782e4fd --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py @@ -0,0 +1,40 @@ +from sqlalchemy import CTE, ColumnElement, Column, Select, exists, func + +from src.db.models.impl.url.core.sqlalchemy import URL + + +class SubtaskCTEContainer: + """ + CTE for URLs eligible for a given subtask. + A successful left join on this indicates the URL is eligible for the subtask. + A true value for `subtask_entry_exists` indicates + a subtask entry for the URL already exists + """ + + def __init__( + self, + cte: CTE, + ) -> None: + self._cte=cte + + @property + def cte(self) -> CTE: + return self._cte + + @property + def entry_exists(self) -> ColumnElement[bool]: + return self.cte.c['subtask_entry_exists'] + + @property + def url_id(self) -> Column[int]: + return self.cte.c['id'] + + @property + def eligible_query(self) -> ColumnElement[int]: + return ( + exists() + .where( + self.url_id == URL.id, + self.entry_exists.is_(False), + ) + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py new file mode 100644 index 00000000..b06442ea --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py @@ -0,0 +1,18 @@ +from sqlalchemy import ColumnElement, exists + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask + + +def get_exists_subtask_query( + subtask_type: AutoAgencyIDSubtaskType, +) -> 
ColumnElement[bool]: + return ( + exists() + .where( + URLAutoAgencyIDSubtask.url_id == URL.id, + URLAutoAgencyIDSubtask.type == subtask_type, + ) + .label("subtask_entry_exists") + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/muckrock.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py new file mode 100644 index 00000000..b1b70cdb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py @@ -0,0 +1,37 @@ +from sqlalchemy import select + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.CKAN, + ), + ) + .join( + LinkBatchURL, + LinkBatchURL.url_id == URL.id, + ) + .join( + Batch, + Batch.id == LinkBatchURL.batch_id, + ) + .where( + Batch.strategy 
== CollectorType.CKAN.value, + + ) + .cte("ckan_eligible") +) + +CKAN_SUBTASK_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py new file mode 100644 index 00000000..cf109207 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py @@ -0,0 +1,99 @@ +from typing import Sequence + +from sqlalchemy import select, exists + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.views.meta_url import MetaURL + +NOT_ROOT_URL_FLAG = ( + ~exists() + .where( + FlagRootURL.url_id == URL.id, + ) +) + +NOT_META_URL_FLAG = ( + ~exists() + .where( + MetaURL.url_id == URL.id, + ) +) + +BLACKLISTED_ROOTS: Sequence[str] = ( + 'https://www.facebook.com', + 'https://www.countyoffice.org', + '://', + 'https://www.usmarshals.gov', + 'https://www.mapquest.com', + 'https://catalog.data.gov', + 'https://www.muckrock.com' +) + +# Root URL must not be blacklisted +WHITELISTED_ROOT_URL = ( + select( + URL.id + ) + .join( + FlagRootURL, + FlagRootURL.url_id == URL.id, + ) + .where( + URL.url.notin_(BLACKLISTED_ROOTS), + ) + .cte("whitelisted_root_url") +) + +ROOT_URLS_WITH_META_URLS = ( + select( + WHITELISTED_ROOT_URL.c.id + 
) + .where( + exists() + .where( + LinkURLRootURL.root_url_id == WHITELISTED_ROOT_URL.c.id, + LinkURLRootURL.url_id == MetaURL.url_id, + ) + ) + .cte("root_urls_with_meta_urls") +) + +HAS_ROOT_URL_WITH_META_URLS = ( + exists() + .where( + LinkURLRootURL.root_url_id == ROOT_URLS_WITH_META_URLS.c.id, + LinkURLRootURL.url_id == URL.id, + ) +) + + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + ) + ) + .join( + LinkURLRootURL, + LinkURLRootURL.url_id == URL.id, + ) + .where( + NOT_META_URL_FLAG, + NOT_ROOT_URL_FLAG, + HAS_ROOT_URL_WITH_META_URLS, + + ) + .cte("homepage_eligible") +) + +HOMEPAGE_SUBTASK_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py new file mode 100644 index 00000000..1f059e86 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py @@ -0,0 +1,40 @@ +from sqlalchemy import select + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.MUCKROCK, + ) + ) + .join( + LinkBatchURL, + LinkBatchURL.url_id == URL.id, + ) + .join( + Batch, + 
Batch.id == LinkBatchURL.batch_id, + ) + .where( + Batch.strategy.in_( + (CollectorType.MUCKROCK_ALL_SEARCH.value, + CollectorType.MUCKROCK_COUNTY_SEARCH.value, + CollectorType.MUCKROCK_SIMPLE_SEARCH.value,) + ), + ) + .cte("muckrock_eligible") +) + +MUCKROCK_SUBTASK_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py new file mode 100644 index 00000000..40533809 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -0,0 +1,26 @@ +from sqlalchemy import select + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + ) + ) + .join( + URLCompressedHTML + ) + .cte("nlp_location_eligible") +) + +NLP_LOCATION_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py new file mode 100644 index 00000000..6ff2841f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py @@ -0,0 
+1,22 @@ +from sqlalchemy import select, ColumnElement, Integer, func + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import ELIGIBLE_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + + +def sum_count(col: ColumnElement[bool], subtask_type: AutoAgencyIDSubtaskType) -> ColumnElement[int]: + return func.coalesce( + func.sum( + col.cast(Integer) + ), + 0, + ).label(subtask_type.value) + +ELIGIBLE_COUNTS_QUERY = ( + select( + sum_count(ELIGIBLE_CTE.c.ckan, AutoAgencyIDSubtaskType.CKAN), + sum_count(ELIGIBLE_CTE.c.muckrock, AutoAgencyIDSubtaskType.MUCKROCK), + sum_count(ELIGIBLE_CTE.c.homepage, AutoAgencyIDSubtaskType.HOMEPAGE_MATCH), + sum_count(ELIGIBLE_CTE.c.nlp_location, AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH), + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py index 2ff45c3e..c4cc6226 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py @@ -18,15 +18,19 @@ def __init__( ) -> None: self.adb_client: AsyncDatabaseClient = adb_client self.task_id: int = task_id + self.linked_urls: list[int] | None = None async def run(self) -> AgencyIDSubtaskRunInfo: try: await self.inner_logic() except Exception as e: return AgencyIDSubtaskRunInfo( - error=str(e) + error=f"{type(e).__name__}: {str(e)}", + linked_url_ids=self.linked_urls ) - return AgencyIDSubtaskRunInfo() + return AgencyIDSubtaskRunInfo( + linked_url_ids=self.linked_urls + ) @abc.abstractmethod async def inner_logic(self) -> AgencyIDSubtaskRunInfo: diff --git a/src/db/client/async_.py b/src/db/client/async_.py index a028d404..e89bae4b 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -41,8 +41,8 @@ from 
src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.api.endpoints.review.approve.query_.core import ApproveURLQueryBuilder from src.api.endpoints.review.enums import RejectionReason -from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.review.next.core import GetNextURLForFinalReviewQueryBuilder +from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.review.reject.query import RejectURLQueryBuilder from src.api.endpoints.search.dtos.response import SearchURLResponse from src.api.endpoints.task.by_id.dto import TaskInfo @@ -69,8 +69,6 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.core import \ UpsertURLsFromDataSourcesQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.queries.has_urls_without_agency_suggestions import \ - HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder from src.core.tasks.url.operators.html.queries.get import \ @@ -652,7 +650,12 @@ async def get_html_content_info(self, url_id: int) -> list[URLHTMLContentInfo]: return await self.run_query_builder(GetHTMLContentInfoQueryBuilder(url_id)) @session_manager - async def link_urls_to_task(self, session: AsyncSession, task_id: int, url_ids: list[int]): + async def link_urls_to_task( + self, + session: AsyncSession, + task_id: int, + url_ids: list[int] + ) -> None: for url_id in url_ids: link = LinkTaskURL( url_id=url_id, @@ -715,8 +718,6 @@ async def get_tasks( tasks=final_results ) - async def has_urls_without_agency_suggestions(self) -> bool: - return await self.run_query_builder(HasURLsWithoutAgencySuggestionsQueryBuilder()) async def 
get_next_url_agency_for_annotation( diff --git a/src/db/constants.py b/src/db/constants.py index 3bab368f..f2cdefb1 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,23 +1,11 @@ from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" STANDARD_ROW_LIMIT = 100 -ALL_ANNOTATION_MODELS = [ - AutoRecordTypeSuggestion, - AutoRelevantSuggestion, - # TODO: Revise - # AutomatedUrlAgencySuggestion, - UserRelevantSuggestion, - UserRecordTypeSuggestion, - UserUrlAgencySuggestion -] - USER_ANNOTATION_MODELS = [ UserRelevantSuggestion, UserRecordTypeSuggestion, diff --git a/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py index 1dd3d217..f2e9be57 100644 --- a/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py @@ -3,14 +3,15 @@ from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel +type_alias = type class URLAutoAgencyIDSubtaskPydantic(BulkInsertableModel): task_id: int url_id: int - subtask: AutoAgencyIDSubtaskType + type: AutoAgencyIDSubtaskType agencies_found: bool detail: SubtaskDetailCode = SubtaskDetailCode.NO_DETAILS @classmethod - def sa_model(cls) -> type[Base]: + def sa_model(cls) -> type_alias[Base]: return URLAutoAgencyIDSubtask \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py index 8066b199..89371498 
100644 --- a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -16,7 +16,7 @@ class URLAutoAgencyIDSubtask( __tablename__ = "url_auto_agency_id_subtasks" - subtask = enum_column( + type = enum_column( AutoAgencyIDSubtaskType, name="agency_auto_suggestion_method" ) diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py index e709957a..5a0fd2b8 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py @@ -1,3 +1,5 @@ +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -7,3 +9,8 @@ class AgencyIDSubtaskSuggestionPydantic( subtask_id: int agency_id: int confidence: int + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return AgencyIDSubtaskSuggestion \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index dcf42ab6..929b88bd 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -1,12 +1,12 @@ +import sqlalchemy as sa from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin -from src.db.models.templates_.base import Base +from src.db.models.templates_.with_id import WithIDBase -import sqlalchemy as sa class AgencyIDSubtaskSuggestion( - Base, + WithIDBase, CreatedAtMixin, AgencyDependentMixin, ): diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py new file mode 100644 index 00000000..bc963e11 --- /dev/null 
+++ b/src/db/models/views/meta_url.py @@ -0,0 +1,26 @@ +""" + CREATE OR REPLACE VIEW meta_url_view AS + SELECT + urls.id + FROM urls + INNER JOIN flag_url_validated fuv on fuv.url_id = urls.id + where fuv.type = 'meta url' +""" + +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class MetaURL( + Base, + ViewMixin, + URLDependentMixin, +): + + __tablename__ = "meta_url_view" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/nlp_location.py b/src/db/queries/implementations/core/common/annotation_exists_/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/planner/queries/ctes/nlp_location.py rename to src/db/queries/implementations/core/common/annotation_exists_/__init__.py diff --git a/src/db/queries/implementations/core/common/annotation_exists_/constants.py b/src/db/queries/implementations/core/common/annotation_exists_/constants.py new file mode 100644 index 00000000..ead32bc0 --- /dev/null +++ b/src/db/queries/implementations/core/common/annotation_exists_/constants.py @@ -0,0 +1,15 @@ +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion + +ALL_ANNOTATION_MODELS = [ + AutoRecordTypeSuggestion, + AutoRelevantSuggestion, + URLAutoAgencyIDSubtask, + UserRelevantSuggestion, + 
UserRecordTypeSuggestion, + UserUrlAgencySuggestion +] diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists_/core.py similarity index 96% rename from src/db/queries/implementations/core/common/annotation_exists.py rename to src/db/queries/implementations/core/common/annotation_exists_/core.py index bf1c07a1..53e8bcf6 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/core.py @@ -17,7 +17,7 @@ from sqlalchemy import case, func, Select, select from src.collectors.enums import URLStatus -from src.db.constants import ALL_ANNOTATION_MODELS +from src.db.queries.implementations.core.common.annotation_exists_.constants import ALL_ANNOTATION_MODELS from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 37b3a560..5d69be2a 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -11,7 +11,7 @@ from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder +from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder class PendingAnnotationExistsCTEQueryBuilder(AnnotationExistsCTEQueryBuilder): diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py 
b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py index e8d584e7..7fdc96b1 100644 --- a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -2,6 +2,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus +from src.db.dtos.url.mapping import URLMapping from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -25,7 +26,8 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with submitted URLs batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - submitted_url_ids: list[int] = await dbdc.create_submitted_urls(count=2) + submitted_url_mappings: list[URLMapping] = await dbdc.create_submitted_urls(count=2) + submitted_url_ids: list[int] = [url_mapping.url_id for url_mapping in submitted_url_mappings] await dbdc.create_batch_url_links( batch_id=batch_submitted, url_ids=submitted_url_ids @@ -36,9 +38,10 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with validated URLs batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - validated_url_ids: list[int] = await dbdc.create_validated_urls( + validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls( count=2 ) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] await dbdc.create_batch_url_links( batch_id=batch_validated, url_ids=validated_url_ids diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 306160fa..4b7b4f75 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -3,6 +3,7 @@ from 
src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping from src.db.helpers.connect import get_postgres_connection_string from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters @@ -24,17 +25,18 @@ async def test_get_batches_aggregated_metrics( adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_ids_error: list[int] = await create_urls( + url_mappings_error: list[URLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.ERROR, count=4, ) - url_ids_ok: list[int] = await create_urls( + url_mappings_ok: list[URLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.OK, count=11, ) - url_ids_all: list[int] = url_ids_error + url_ids_ok + url_mappings_all: list[URLMapping] = url_mappings_error + url_mappings_ok + url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] await create_batch_url_links( adb_client=adb_client, batch_id=batch_id, diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index 455d9399..0657c66f 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -6,6 +6,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ create_url_data_sources @@ -22,10 +23,11 @@ async def test_get_batches_breakdown_metrics(api_test_helper): 
adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_ids_1: list[int] = await create_urls( + url_mappings_1: list[URLMapping] = await create_urls( adb_client=adb_client, count=3, ) + url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] await create_batch_url_links(adb_client=adb_client, batch_id=batch_id_1, url_ids=url_ids_1) await create_validated_flags( adb_client=adb_client, @@ -48,15 +50,17 @@ async def test_get_batches_breakdown_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, date_generated=today - timedelta(days=14) ) - error_url_ids: list[int] = await create_urls( + error_url_mappings: list[URLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.ERROR, count=4, ) - validated_url_ids: list[int] = await create_urls( + error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] + validated_url_mappings: list[URLMapping] = await create_urls( adb_client=adb_client, count=8, ) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] await create_validated_flags( adb_client=adb_client, url_ids=validated_url_ids[:3], diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index 9fe7a45c..e48db202 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import SuggestedStatus +from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters @@ -24,7 +25,8 @@ async def test_get_backlog_metrics(api_test_helper): # Ensure that multiple days in each month are added to the backlog table, 
with different values batch_1_id: int = await ddc.create_batch() - url_ids_1: list[int] = await ddc.create_urls(count=3) + url_mappings_1: list[URLMapping] = await ddc.create_urls(count=3) + url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) submitted_url_ids_1: list[int] = url_ids_1[:2] await ddc.create_validated_flags( @@ -42,16 +44,18 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_2_id: int = await ddc.create_batch() - not_relevant_url_ids_2: list[int] = await ddc.create_urls(count=6) + not_relevant_url_mappings_2: list[URLMapping] = await ddc.create_urls(count=6) + not_relevant_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in not_relevant_url_mappings_2] await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) await ddc.create_validated_flags( url_ids=not_relevant_url_ids_2[:4], validation_type=URLValidatedType.NOT_RELEVANT ) - error_url_ids_2: list[int] = await ddc.create_urls( + error_url_mappings_2: list[URLMapping] = await ddc.create_urls( status=URLStatus.ERROR, count=2 ) + error_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings_2] await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( @@ -63,7 +67,8 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_3_id: int = await ddc.create_batch() - url_ids_3: list[int] = await ddc.create_urls(count=12) + url_mappings_3: list[URLMapping] = await ddc.create_urls(count=12) + url_ids_3: list[int] = [url_mapping.url_id for url_mapping in url_mappings_3] await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) await ddc.create_validated_flags( url_ids=url_ids_3[:5], diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index f22ec757..08c52845 
100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -4,6 +4,7 @@ import pytest from src.collectors.enums import CollectorType, URLStatus +from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -32,23 +33,26 @@ async def test_get_urls_aggregated_metrics(api_test_helper): strategy=CollectorType.MANUAL, date_generated=today - timedelta(days=1) ) - url_ids_0: list[int] = await ddc.create_urls(batch_id=batch_0) - oldest_url_id: int = url_ids_0[0] + url_mappings_0: list[URLMapping] = await ddc.create_urls(batch_id=batch_0) + oldest_url_id: int = url_mappings_0[0].url_id batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, ) - url_ids_1_ok: list[int] = await ddc.create_urls(batch_id=batch_1, count=1) - url_ids_1_submitted: list[int] = await ddc.create_submitted_urls(count=2) + url_mappings_1_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) + url_mappings_1_submitted: list[URLMapping] = await ddc.create_submitted_urls(count=2) + url_ids_1_submitted: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1_submitted] await ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, ) - url_ids_2_ok: list[int] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) - url_ids_2_error: list[int] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) - url_ids_2_validated: list[int] = await ddc.create_validated_urls(count=1, validation_type=URLValidatedType.DATA_SOURCE) - url_ids_2_not_relevant: list[int] = await ddc.create_validated_urls(count=5, 
validation_type=URLValidatedType.NOT_RELEVANT) + url_mappings_2_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_mappings_2_error: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) + url_mappings_2_validated: list[URLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLValidatedType.DATA_SOURCE) + url_mappings_2_not_relevant: list[URLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLValidatedType.NOT_RELEVANT) + url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] + url_ids_2_not_relevant: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_not_relevant] await ddc.create_batch_url_links( url_ids=url_ids_2_validated + url_ids_2_not_relevant, batch_id=batch_2 diff --git a/tests/automated/integration/api/review/test_batch_filtering.py b/tests/automated/integration/api/review/test_batch_filtering.py index 820dc9c0..481f7e90 100644 --- a/tests/automated/integration/api/review/test_batch_filtering.py +++ b/tests/automated/integration/api/review/test_batch_filtering.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from src.db.dtos.url.mapping import URLMapping from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo @@ -17,7 +18,8 @@ async def test_batch_filtering( batch_id: int = batch_url_creation_info.batch_id - validated_url_ids: list[int] = await dbdc.create_validated_urls(count=4) + validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls(count=4) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] await dbdc.create_batch_url_links( url_ids=validated_url_ids, batch_id=batch_id diff --git a/tests/automated/integration/core/async_/run_task/test_break_loop.py 
b/tests/automated/integration/core/async_/run_task/test_break_loop.py index 0d8a9bc2..71b5704f 100644 --- a/tests/automated/integration/core/async_/run_task/test_break_loop.py +++ b/tests/automated/integration/core/async_/run_task/test_break_loop.py @@ -21,9 +21,9 @@ async def test_run_task_break_loop(db_data_creator: DBDataCreator): and an alert should be sent to discord """ - async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + async def run_task(self) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( - task_id=task_id, + task_id=1, outcome=TaskOperatorOutcome.SUCCESS, task_type=TaskType.HTML ) diff --git a/tests/automated/integration/core/async_/run_task/test_prereq_met.py b/tests/automated/integration/core/async_/run_task/test_prereq_met.py index a7724a45..cda6a6d6 100644 --- a/tests/automated/integration/core/async_/run_task/test_prereq_met.py +++ b/tests/automated/integration/core/async_/run_task/test_prereq_met.py @@ -21,9 +21,9 @@ async def test_run_task_prereq_met(db_data_creator: DBDataCreator): And a task entry should be created in the database """ - async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + async def run_task(self) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( - task_id=task_id, + task_id=1, task_type=TaskType.HTML, outcome=TaskOperatorOutcome.SUCCESS, ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py similarity index 79% rename from tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py rename to tests/automated/integration/tasks/url/impl/agency_identification/conftest.py index b6787899..b6a08ee8 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py @@ -7,23 +7,20 @@ from 
src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient -from tests.automated.integration.tasks.url.impl.agency_identification.happy_path.mock import mock_run_subtask @pytest.fixture def operator( adb_client_test: AsyncDatabaseClient -): +) -> AgencyIdentificationTaskOperator: operator = AgencyIdentificationTaskOperator( adb_client=adb_client_test, loader=AgencyIdentificationSubtaskLoader( pdap_client=create_autospec(PDAPClient), - muckrock_api_interface=create_autospec(MuckrockAPIInterface) - ) - ) - operator.run_subtask = AsyncMock( - side_effect=mock_run_subtask + muckrock_api_interface=create_autospec(MuckrockAPIInterface), + adb_client=adb_client_test + ), ) return operator diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py deleted file mode 100644 index a48cfc0c..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py +++ /dev/null @@ -1,128 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest -from aiohttp import ClientSession - -from src.collectors.enums import CollectorType -from src.core.tasks.url.enums import TaskOperatorOutcome -from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import \ - MuckrockAgencyIDSubtaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from 
tests.helpers.batch_creation_parameters.enums import URLCreationEnum -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 - - -@pytest.mark.asyncio -async def test_agency_identification_task( - db_data_creator: DBDataCreator, - test_client_session: ClientSession, - operator: AgencyIdentificationTaskOperator, -): - """Test full flow of AgencyIdentificationTaskOperator""" - - # Confirm does not yet meet prerequisites - assert not await operator.meets_task_prerequisites() - - collector_type_to_url_id: dict[CollectorType | None, int] = {} - - # Create six urls, one from each strategy - for strategy in [ - CollectorType.COMMON_CRAWLER, - CollectorType.AUTO_GOOGLER, - CollectorType.MUCKROCK_COUNTY_SEARCH, - CollectorType.MUCKROCK_SIMPLE_SEARCH, - CollectorType.MUCKROCK_ALL_SEARCH, - CollectorType.CKAN, - ]: - # Create two URLs for each, one pending and one errored - creation_info: BatchURLCreationInfoV2 = await db_data_creator.batch_v2( - parameters=TestBatchCreationParameters( - strategy=strategy, - urls=[ - TestURLCreationParameters( - count=1, - status=URLCreationEnum.OK, - with_html_content=True - ), - TestURLCreationParameters( - count=1, - status=URLCreationEnum.ERROR, - with_html_content=True - ) - ] - ) - ) - collector_type_to_url_id[strategy] = creation_info.urls_by_status[URLCreationEnum.OK].url_mappings[0].url_id - - # Create an additional two urls with no collector. 
- response = await db_data_creator.url_v2( - parameters=[ - TestURLCreationParameters( - count=1, - status=URLCreationEnum.OK, - with_html_content=True - ), - TestURLCreationParameters( - count=1, - status=URLCreationEnum.ERROR, - with_html_content=True - ) - ] - ) - collector_type_to_url_id[None] = response.urls_by_status[URLCreationEnum.OK].url_mappings[0].url_id - - - # Confirm meets prerequisites - assert await operator.meets_task_prerequisites() - # Run task - run_info = await operator.run_task() - assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - - # Confirm tasks are piped into the correct subtasks - # * common_crawler into common_crawler_subtask - # * auto_googler into auto_googler_subtask - # * muckrock_county_search into muckrock_subtask - # * muckrock_simple_search into muckrock_subtask - # * muckrock_all_search into muckrock_subtask - # * ckan into ckan_subtask - - - mock_run_subtask: AsyncMock = operator.run_subtask - - # Check correct number of calls to run_subtask - assert mock_run_subtask.call_count == 7 - - # Confirm subtask classes are correct for the given urls - d2 = {} - for call_arg in mock_run_subtask.call_args_list: - subtask_class = call_arg[0][0].__class__ - url_id = call_arg[0][1] - d2[url_id] = subtask_class - - - subtask_class_collector_type = [ - (MuckrockAgencyIDSubtaskOperator, CollectorType.MUCKROCK_ALL_SEARCH), - (MuckrockAgencyIDSubtaskOperator, CollectorType.MUCKROCK_COUNTY_SEARCH), - (MuckrockAgencyIDSubtaskOperator, CollectorType.MUCKROCK_SIMPLE_SEARCH), - (CKANAgencyIDSubtaskOperator, CollectorType.CKAN), - (UnknownAgencyIdentificationSubtask, CollectorType.COMMON_CRAWLER), - (UnknownAgencyIdentificationSubtask, CollectorType.AUTO_GOOGLER), - (UnknownAgencyIdentificationSubtask, None) - ] - - for subtask_class, collector_type in subtask_class_collector_type: - url_id = collector_type_to_url_id[collector_type] - assert d2[url_id] == subtask_class - - # Confirm task again does not meet prerequisites - 
assert not await operator.meets_task_prerequisites() - # # Check confirmed and auto suggestions - adb_client = db_data_creator.adb_client - # TODO: This component appears to be affected by the order of other tests being run - # but does pass when run alone. Resolve. - # await assert_expected_confirmed_and_auto_suggestions(adb_client) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/insert.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/queries/insert.py rename to tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/__init__.py diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py new file mode 100644 index 00000000..90aacfa5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py @@ -0,0 +1,100 @@ +from unittest.mock import AsyncMock + +import pytest + +from src.collectors.enums import CollectorType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.external.pdap.enums import MatchAgencyResponseStatus +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator +from src.core.enums import SuggestionType +from src.external.pdap.dtos.match_agency.response import 
MatchAgencyResponse +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_ckan_subtask( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator +): + # Test that ckan subtask correctly sends agency id to + # CKANAPIInterface, sends resultant agency name to + # PDAPClient and adds received suggestions to + # url_agency_suggestions + adb_client: AsyncDatabaseClient = operator.adb_client + + # Run basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency_name": "Test Agency" + } + ) + )[0].url_id + applicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.CKAN + ) + await db_data_creator.create_batch_url_links( + url_ids=[applicable_url_id], + batch_id=applicable_batch_id + ) + + # Confirm prerequisite met and subtask is CKAN + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.CKAN + + pdap_client_mock = operator.loader._pdap_client + pdap_client_mock.match_agency.return_value = MatchAgencyResponse( + status=MatchAgencyResponseStatus.PARTIAL_MATCH, + matches=[ + MatchAgencyInfo( + id=1, + submitted_name="Mock Agency Name", + ), + MatchAgencyInfo( + id=2, + submitted_name="Another Mock Agency Name", + ) + ] + ) + + # Create agencies + await db_data_creator.create_agency(1) + await db_data_creator.create_agency(2) + + # Run the operator + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + # Confirm prerequisite no longer met + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Verify results + subtasks: 
list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 1 + subtask: URLAutoAgencyIDSubtask = subtasks[0] + assert subtask.type == AutoAgencyIDSubtaskType.CKAN + assert subtask.url_id == applicable_url_id + subtask_id: int = subtask.id + + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all( + AgencyIDSubtaskSuggestion + ) + assert len(suggestions) == 2 + assert {suggestion.confidence for suggestion in suggestions} == {50} + assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} + + # Assert methods called as expected + pdap_client_mock.match_agency.assert_called_once_with(name="Test Agency") diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py new file mode 100644 index 00000000..a128bde1 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.mark.asyncio +async def test_homepage_match(): + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py new file mode 100644 index 00000000..7cf72c5e --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py @@ -0,0 +1,148 @@ +from unittest.mock import MagicMock + +import pytest + +from src.collectors.enums import CollectorType +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType +from src.core.enums import SuggestionType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.enums import MatchAgencyResponseStatus +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_muckrock_subtask( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator +): + adb_client: AsyncDatabaseClient = operator.adb_client + + # Run 
basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Add validated URL and confirm no next subtask + await db_data_creator.create_validated_urls(count=1) + + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Add unvalidated URL without collector type + inapplicable_url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Should still not have subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Create Auto Googler batch and link to validated URL + inapplicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.AUTO_GOOGLER + ) + await db_data_creator.create_batch_url_links( + url_ids=[inapplicable_url_id], + batch_id=inapplicable_batch_id + ) + + # Confirm prerequisite not met + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Create Muckrock batch and link to validated URL + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency": 123 + } + ) + )[0].url_id + applicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.MUCKROCK_SIMPLE_SEARCH + ) + await db_data_creator.create_batch_url_links( + url_ids=[applicable_url_id], + batch_id=applicable_batch_id + ) + + # Confirm prerequisite met and subtask is Muckrock + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.MUCKROCK + + # Test that muckrock subtask correctly sends agency name to + # MatchAgenciesInterface and adds received suggestions to + # url_agency_suggestions + + # Create mock instances for dependency injections + muckrock_api_interface_mock = operator.loader._muckrock_api_interface + pdap_client_mock = operator.loader._pdap_client + + # Set up mock return values for method calls + 
muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( + type=AgencyLookupResponseType.FOUND, + name="Mock Agency Name", + error=None + ) + + # Create agencies + await db_data_creator.create_agency(1) + await db_data_creator.create_agency(2) + + pdap_client_mock.match_agency.return_value = MatchAgencyResponse( + status=MatchAgencyResponseStatus.PARTIAL_MATCH, + matches=[ + MatchAgencyInfo( + id=1, + submitted_name="Mock Agency Name", + ), + MatchAgencyInfo( + id=2, + submitted_name="Another Mock Agency Name", + ) + ] + ) + + # Run the operator + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + # Confirm prerequisite no longer met + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Verify results + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 1 + subtask: URLAutoAgencyIDSubtask = subtasks[0] + assert subtask.type == AutoAgencyIDSubtaskType.MUCKROCK + assert subtask.url_id == applicable_url_id + subtask_id: int = subtask.id + + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all( + AgencyIDSubtaskSuggestion + ) + assert len(suggestions) == 2 + assert {suggestion.confidence for suggestion in suggestions} == {50} + assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} + + + # # Assert methods called as expected + muckrock_api_interface_mock.lookup_agency.assert_called_once_with( + muckrock_agency_id=123 + ) + pdap_client_mock.match_agency.assert_called_once_with( + name="Mock Agency Name" + ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py new file mode 100644 index 00000000..19f5eb5b --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.mark.asyncio +async def test_nlp_location_match(): + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py deleted file mode 100644 index 832ca7df..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py +++ /dev/null @@ -1,58 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest - -from src.external.pdap.enums import MatchAgencyResponseStatus -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator -from src.core.enums import SuggestionType -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_ckan_subtask(db_data_creator: DBDataCreator): - # Test that ckan subtask correctly sends agency id to - # CKANAPIInterface, sends resultant agency name to - # PDAPClient and adds received suggestions to - # url_agency_suggestions - - pdap_client = AsyncMock() - pdap_client.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) # Assuming MatchAgencyResponse 
is a class - - # Create an instance of CKANAgencyIdentificationSubtask - task = CKANAgencyIDSubtaskOperator(pdap_client) - - # Call the run method with static values - collector_metadata = {"agency_name": "Test Agency"} - url_id = 1 - - # Call the run method - result = await task.inner_logic(url_id, collector_metadata) - - # Check the result - assert len(result) == 2 - assert result[0].url_id == 1 - assert result[0].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert result[0].pdap_agency_id == 1 - assert result[0].agency_name == "Mock Agency Name" - assert result[1].url_id == 1 - assert result[1].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert result[1].pdap_agency_id == 2 - assert result[1].agency_name == "Another Mock Agency Name" - - # Assert methods called as expected - pdap_client.match_agency.assert_called_once_with(name="Test Agency") - diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py deleted file mode 100644 index f08db57c..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py +++ /dev/null @@ -1,80 +0,0 @@ -from unittest.mock import MagicMock - -import pytest - -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response 
import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_muckrock_subtask(db_data_creator: DBDataCreator): - # Test that muckrock subtask correctly sends agency name to - # MatchAgenciesInterface and adds received suggestions to - # url_agency_suggestions - - # Create mock instances for dependency injections - muckrock_api_interface_mock = MagicMock(spec=MuckrockAPIInterface) - pdap_client_mock = MagicMock(spec=PDAPClient) - - # Set up mock return values for method calls - muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( - type=AgencyLookupResponseType.FOUND, - name="Mock Agency Name", - error=None - ) - - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) - - # Create an instance of MuckrockAgencyIdentificationSubtask with mock dependencies - muckrock_agency_identification_subtask = MuckrockAgencyIDSubtaskOperator( - muckrock_api_interface=muckrock_api_interface_mock, - pdap_client=pdap_client_mock - ) - - # Run the subtask - results: list[URLAgencySuggestionInfo] = await muckrock_agency_identification_subtask.inner_logic( - url_id=1, - collector_metadata={ - "agency": 123 - } - ) - - # Verify the results - assert len(results) == 2 - assert results[0].url_id == 1 - assert results[0].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert results[0].pdap_agency_id == 1 - assert results[0].agency_name == "Mock Agency Name" - assert results[1].url_id == 1 - assert results[1].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert results[1].pdap_agency_id == 2 - assert results[1].agency_name == "Another Mock Agency Name" - - # Assert methods called as expected - 
muckrock_api_interface_mock.lookup_agency.assert_called_once_with( - muckrock_agency_id=123 - ) - pdap_client_mock.match_agency.assert_called_once_with( - name="Mock Agency Name" - ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py deleted file mode 100644 index a2a32404..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask - - -@pytest.mark.asyncio -async def test_unknown_agency_identification_subtask(): - # Test that no_collector subtask correctly adds URL to - # url_agency_suggestions with label 'Unknown' - subtask = UnknownAgencyIdentificationSubtask() - results: list[URLAgencySuggestionInfo] = await subtask.inner_logic(url_id=1, collector_metadata={}) - assert len(results) == 1 - assert results[0].url_id == 1 - assert results[0].suggestion_type == SuggestionType.UNKNOWN \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/annotate.py b/tests/helpers/data_creator/commands/impl/annotate.py index 5f341326..1f549615 100644 --- a/tests/helpers/data_creator/commands/impl/annotate.py +++ b/tests/helpers/data_creator/commands/impl/annotate.py @@ -7,7 +7,7 @@ from src.core.enums import SuggestionType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase -from tests.helpers.data_creator.commands.impl.suggestion.auto.agency import AgencyAutoSuggestionsCommand +from tests.helpers.data_creator.commands.impl.suggestion.auto.agency_.core 
import AgencyAutoSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.record_type import AutoRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.relevant import AutoRelevantSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py similarity index 84% rename from tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py rename to tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py index 96743df8..a07aabc2 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py @@ -25,6 +25,7 @@ def __init__( @override async def run(self) -> None: + task_id: int = await self.add_task() suggestions = [] for _ in range(self.count): if self.suggestion_type == SuggestionType.UNKNOWN: @@ -43,4 +44,14 @@ async def run(self) -> None: await self.adb_client.add_agency_auto_suggestions( suggestions=suggestions - ) \ No newline at end of file + ) + + async def add_task(self) -> int: + raise NotImplementedError + + async def create_subtask(self, task_id: int) -> int: + raise NotImplementedError + + async def add_suggestions(self) -> None: + raise NotImplementedError + diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index a27f2c79..4b8b4751 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -27,7 +27,7 @@ from tests.helpers.data_creator.commands.impl.batch_v2 import BatchV2Command from 
tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand from tests.helpers.data_creator.commands.impl.suggestion.agency_confirmed import AgencyConfirmedSuggestionCommand -from tests.helpers.data_creator.commands.impl.suggestion.auto.agency import AgencyAutoSuggestionsCommand +from tests.helpers.data_creator.commands.impl.suggestion.auto.agency_.core import AgencyAutoSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.record_type import AutoRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.relevant import AutoRelevantSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand @@ -422,6 +422,7 @@ async def create_urls( status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, record_type: RecordType | None = RecordType.RESOURCES, + collector_metadata: dict | None = None, count: int = 1, batch_id: int | None = None ) -> list[URLMapping]: @@ -431,6 +432,7 @@ async def create_urls( status=status, source=source, record_type=record_type, + collector_metadata=collector_metadata, count=count ) url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 6054c902..83b2e3f5 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -29,12 +29,14 @@ async def create_urls( status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, record_type: RecordType | None = RecordType.RESOURCES, + collector_metadata: dict | None = None, count: int = 1 ) -> list[URLMapping]: urls: list[URLInsertModel] = generate_urls( status=status, source=source, record_type=record_type, + collector_metadata=collector_metadata, count=count, ) url_ids = await adb_client.bulk_insert(urls, return_ids=True) diff --git a/tests/helpers/data_creator/generate.py 
b/tests/helpers/data_creator/generate.py index efea01cc..5dabc016 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -42,6 +42,7 @@ def generate_urls( status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, record_type: RecordType | None = RecordType.RESOURCES, + collector_metadata: dict | None = None, count: int = 1 ) -> list[URLInsertModel]: results: list[URLInsertModel] = [] @@ -52,6 +53,7 @@ def generate_urls( status=status, source=source, name=f"Example {val}", + collector_metadata=collector_metadata, record_type=record_type, )) return results From e86e589033b3733b0640ccf8f0aa3608e9d6f2d0 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 6 Sep 2025 08:25:15 -0400 Subject: [PATCH 20/33] Resolve existing tests --- ...aee0dd79_overhaul_agency_identification.py | 7 +- .../get/queries/agency_suggestion_/core.py | 2 +- src/api/endpoints/review/next/convert.py | 13 +- src/api/endpoints/review/next/core.py | 78 +- .../review/next/queries/eligible_urls.py | 35 + .../operators/submit_approved/queries/get.py | 2 + .../submit_approved/queries/has_validated.py | 4 + src/db/client/async_.py | 16 - src/db/statement_composer.py | 18 - .../happy_path => api/annotate}/__init__.py | 0 .../api/annotate/agency/__init__.py | 0 .../agency/test_multiple_auto_suggestions.py | 46 ++ .../test_multiple_auto_suggestions_no_html.py | 35 + .../agency/test_other_user_annotation.py | 44 + .../agency/test_single_confirmed_agency.py | 22 + .../test_single_unknown_auto_suggestions.py | 45 ++ .../agency/test_submit_and_get_next.py | 42 + .../api/annotate/agency/test_submit_new.py | 38 + .../integration/api/annotate/all/__init__.py | 0 .../api/annotate/all/test_happy_path.py | 88 ++ .../annotate/all/test_post_batch_filtering.py | 41 + .../api/annotate/all/test_validation_error.py | 27 + .../integration/api/annotate/helpers.py | 22 + .../api/annotate/record_type/__init__.py | 0 .../annotate/record_type/test_record_type.py | 166 
++++ .../api/annotate/relevancy/__init__.py | 0 .../api/annotate/relevancy/test_relevancy.py | 213 +++++ .../integration/api/annotate/test_.py | 0 .../integration/api/test_annotate.py | 756 ------------------ .../core/async_/run_task/test_prereq_met.py | 6 - .../test_new_agency.py | 41 - .../test_validated.py | 2 +- .../happy_path/asserts.py | 19 - .../agency_identification/happy_path/data.py | 34 - .../agency_identification/happy_path/mock.py | 19 - .../test_validated_meta_url.py | 36 +- .../tasks/url/impl/test_url_404_probe.py | 1 + .../impl/suggestion/auto/agency_/core.py | 61 +- tests/helpers/data_creator/core.py | 19 +- tests/helpers/setup/final_review/core.py | 2 +- 40 files changed, 985 insertions(+), 1015 deletions(-) create mode 100644 src/api/endpoints/review/next/queries/eligible_urls.py rename tests/automated/integration/{tasks/url/impl/agency_identification/happy_path => api/annotate}/__init__.py (100%) create mode 100644 tests/automated/integration/api/annotate/agency/__init__.py create mode 100644 tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py create mode 100644 tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py create mode 100644 tests/automated/integration/api/annotate/agency/test_other_user_annotation.py create mode 100644 tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py create mode 100644 tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py create mode 100644 tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py create mode 100644 tests/automated/integration/api/annotate/agency/test_submit_new.py create mode 100644 tests/automated/integration/api/annotate/all/__init__.py create mode 100644 tests/automated/integration/api/annotate/all/test_happy_path.py create mode 100644 tests/automated/integration/api/annotate/all/test_post_batch_filtering.py create mode 100644 
tests/automated/integration/api/annotate/all/test_validation_error.py create mode 100644 tests/automated/integration/api/annotate/helpers.py create mode 100644 tests/automated/integration/api/annotate/record_type/__init__.py create mode 100644 tests/automated/integration/api/annotate/record_type/test_record_type.py create mode 100644 tests/automated/integration/api/annotate/relevancy/__init__.py create mode 100644 tests/automated/integration/api/annotate/relevancy/test_relevancy.py create mode 100644 tests/automated/integration/api/annotate/test_.py delete mode 100644 tests/automated/integration/api/test_annotate.py delete mode 100644 tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py index 702774d5..e7d9b6fd 100644 --- a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -52,15 +52,18 @@ def upgrade() -> None: _create_url_unknown_agencies_view() _create_meta_url_view() _create_link_agency_id_subtask_agencies_table() + _drop_url_annotation_flags_view() _create_new_url_annotation_flags_view() _drop_url_auto_agency_suggestions_table() - +def _drop_url_annotation_flags_view(): + op.execute(f"DROP VIEW IF EXISTS url_annotation_flags") def downgrade() -> None: _drop_url_unknown_agencies_view() _create_url_auto_agency_suggestions_table() + _drop_url_annotation_flags_view() _create_old_url_annotation_flags_view() 
_drop_link_agency_id_subtask_agencies_table() _drop_url_auto_agency_subtask_table() @@ -92,7 +95,7 @@ def _create_new_url_annotation_flags_view(): f""" CREATE OR REPLACE VIEW url_annotation_flags AS ( - SELECT u.id, + SELECT u.id as url_id, EXISTS (SELECT 1 FROM public.auto_record_type_suggestions a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion, EXISTS (SELECT 1 FROM public.auto_relevant_suggestions a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion, EXISTS (SELECT 1 FROM public.{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} a WHERE a.url_id = u.id) AS has_auto_agency_suggestion, diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py index 74740591..a9a33e84 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py @@ -37,7 +37,7 @@ async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo ) .outerjoin( Agency, - Agency.id == cte.agency_id + Agency.agency_id == cte.agency_id ) .where( cte.url_id == self.url_id diff --git a/src/api/endpoints/review/next/convert.py b/src/api/endpoints/review/next/convert.py index 962b7e1e..ca087895 100644 --- a/src/api/endpoints/review/next/convert.py +++ b/src/api/endpoints/review/next/convert.py @@ -4,6 +4,7 @@ from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion @@ -67,8 +68,15 @@ def _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_i def _convert_agency_to_get_next_url_for_agency_agency_info( suggestion_type: 
SuggestionType, - agency: Agency + agency: Agency | None ) -> GetNextURLForAgencyAgencyInfo: + if agency is None: + if suggestion_type == SuggestionType.UNKNOWN: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=suggestion_type, + ) + raise ValueError("agency cannot be None for suggestion type other than unknown") + return GetNextURLForAgencyAgencyInfo( suggestion_type=suggestion_type, pdap_agency_id=agency.agency_id, @@ -87,7 +95,8 @@ def _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_ if not subtask.agencies_found: count_agencies_not_found += 1 continue - for suggestion in subtask.suggestions: + suggestions: list[AgencyIDSubtaskSuggestion] = subtask.suggestions + for suggestion in suggestions: info: GetNextURLForAgencyAgencyInfo = _convert_agency_to_get_next_url_for_agency_agency_info( suggestion_type=SuggestionType.AUTO_SUGGESTION, agency=suggestion.agency diff --git a/src/api/endpoints/review/next/core.py b/src/api/endpoints/review/next/core.py index 6fb6c95d..1736a970 100644 --- a/src/api/endpoints/review/next/core.py +++ b/src/api/endpoints/review/next/core.py @@ -1,6 +1,4 @@ -from typing import Type - -from sqlalchemy import FromClause, select, and_, Select, desc, asc, func +from sqlalchemy import FromClause, select, Select, desc, asc, func, CTE from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload @@ -9,6 +7,7 @@ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from src.api.endpoints.review.next.extract import extract_html_content_infos, extract_optional_metadata from src.api.endpoints.review.next.queries.count_reviewed import COUNT_REVIEWED_CTE +from src.api.endpoints.review.next.queries.eligible_urls import build_eligible_urls_cte from src.api.endpoints.review.next.templates.count_cte import CountCTE from src.collectors.enums import URLStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info 
@@ -22,7 +21,6 @@ from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder @@ -49,13 +47,6 @@ def __init__(self, batch_id: int | None = None): (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), (URL.confirmed_agencies, LinkURLAgency.agency) ] - self.triple_join_relationships = [ - ( - URL.auto_agency_subtasks, - URLAutoAgencyIDSubtask.suggestions, - AgencyIDSubtaskSuggestion.agency - ) - ] self.count_label = "count" @@ -70,58 +61,26 @@ def _get_where_exist_clauses( where_clauses.append(where_clause) return where_clauses - def _build_base_query( - self, - anno_exists_query: FromClause, - ) -> Select: - builder = self.anno_exists_builder - where_exist_clauses = self._get_where_exist_clauses( - builder.query - ) + def _build_base_query(self) -> Select: + eligible_urls: CTE = build_eligible_urls_cte(batch_id=self.batch_id) query = ( select( URL, - self._sum_exists_query(anno_exists_query, USER_ANNOTATION_MODELS) ) - .select_from(anno_exists_query) + .select_from( + eligible_urls + ) .join( URL, - URL.id == builder.url_id - ) - ) - if self.batch_id is not None: - query = ( - query.join( - LinkBatchURL - ) - .where( - LinkBatchURL.batch_id == self.batch_id - ) + URL.id == eligible_urls.c.url_id ) - - query = ( - query.where( - and_( - URL.status == URLStatus.OK.value, - *where_exist_clauses - ) + .where( + URL.status == URLStatus.OK.value ) ) return query - - def _sum_exists_query(self, query, models: list[Type[URLDependentMixin]]): - return sum( - [getattr(query.c, self.anno_exists_builder.get_exists_label(model)) for model in 
models] - ).label(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL) - - - async def _apply_batch_id_filter(self, url_query: Select, batch_id: int | None): - if batch_id is None: - return url_query - return url_query.where(URL.batch_id == batch_id) - async def _apply_options( self, url_query: Select @@ -135,17 +94,11 @@ async def _apply_options( joinedload(primary).joinedload(secondary) for primary, secondary in self.double_join_relationships ], - *[ - joinedload(primary).joinedload(secondary).joinedload(tertiary) - for primary, secondary, tertiary in self.triple_join_relationships - ] + joinedload(URL.auto_agency_subtasks) + .joinedload(URLAutoAgencyIDSubtask.suggestions) + .contains_eager(AgencyIDSubtaskSuggestion.agency) ) - async def _apply_order_clause(self, url_query: Select): - return url_query.order_by( - desc(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL), - asc(URL.id) - ) async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | None: if self.batch_id is None: @@ -172,6 +125,7 @@ async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | return FinalReviewBatchInfo(**raw_result.mappings().one()) async def get_count_ready_query(self): + # TODO: Migrate to separate query builder builder = self.anno_exists_builder count_ready_query = ( select( @@ -261,9 +215,7 @@ async def run( raise FailedQueryException(f"Failed to convert result for url id {result.id} to response") from e async def build_url_query(self): - anno_exists_query = self.anno_exists_builder.query - url_query = self._build_base_query(anno_exists_query) + url_query = self._build_base_query() url_query = await self._apply_options(url_query) - url_query = await self._apply_order_clause(url_query) return url_query diff --git a/src/api/endpoints/review/next/queries/eligible_urls.py b/src/api/endpoints/review/next/queries/eligible_urls.py new file mode 100644 index 00000000..bee5cea2 --- /dev/null +++ b/src/api/endpoints/review/next/queries/eligible_urls.py @@ -0,0 +1,35 @@ +from 
sqlalchemy import CTE, select, Select + +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView + +uafw = URLAnnotationFlagsView + +def build_eligible_urls_cte(batch_id: int | None = None) -> CTE: + query: Select = ( + select( + uafw.url_id, + ) + .where( + # uafw.has_auto_agency_suggestion.is_(True), + # uafw.has_auto_record_type_suggestion.is_(True), + # uafw.has_auto_relevant_suggestion.is_(True), + uafw.has_user_relevant_suggestion.is_(True), + uafw.has_user_agency_suggestion.is_(True), + uafw.has_user_record_type_suggestion.is_(True), + uafw.was_reviewed.is_(False) + ) + ) + + if batch_id is not None: + query = ( + query.join( + LinkBatchURL, + LinkBatchURL.url_id == uafw.url_id + ) + .where( + LinkBatchURL.batch_id == batch_id + ) + ) + + return query.cte("eligible_urls") diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index dc51dfbb..19b32b5d 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -4,6 +4,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -31,6 +32,7 @@ async def _build_query(): query = ( select(URL) .join(FlagURLValidated, FlagURLValidated.url_id == URL.id) + .where(FlagURLValidated.type == URLValidatedType.DATA_SOURCE) .options( selectinload(URL.optional_data_source_metadata), selectinload(URL.confirmed_agencies), diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py 
b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py index a554b8be..5a3ff464 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py +++ b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py @@ -2,6 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -16,6 +17,9 @@ async def run(self, session: AsyncSession) -> bool: FlagURLValidated, FlagURLValidated.url_id == URL.id ) + .where( + FlagURLValidated.type == URLValidatedType.DATA_SOURCE + ) ) urls = await session.execute(query) urls = urls.scalars().all() diff --git a/src/db/client/async_.py b/src/db/client/async_.py index e89bae4b..19cbc3f5 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -766,22 +766,6 @@ async def add_confirmed_agency_url_links( ) session.add(confirmed_agency) - @session_manager - async def add_agency_auto_suggestions( - self, - session: AsyncSession, - suggestions: list[URLAgencySuggestionInfo] - ): - raise NotImplementedError("Revise") - # for suggestion in suggestions: - # url_agency_suggestion = AutomatedUrlAgencySuggestion( - # url_id=suggestion.url_id, - # agency_id=suggestion.pdap_agency_id, - # is_unknown=suggestion.suggestion_type == SuggestionType.UNKNOWN, - # confidence=0 - # ) - # session.add(url_agency_suggestion) - @session_manager async def add_agency_manual_suggestion( self, diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 69e87219..8e172733 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -72,24 +72,6 @@ def simple_count_subquery(model, attribute: str, label: str) -> Subquery: func.count(attr_value).label(label) 
).group_by(attr_value).subquery() - @staticmethod - def exclude_urls_with_agency_suggestions( - statement: Select - ): - raise NotImplementedError - # # Aliases for clarity - # AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) - # - # # Exclude if automated suggestions exist - # statement = statement.where( - # ~exists().where(AutomatedSuggestion.url_id == URL.id) - # ) - # # Exclude if confirmed agencies exist - # statement = statement.where( - # ~exists().where(LinkURLAgency.url_id == URL.id) - # ) - # return statement - @staticmethod def pending_urls_missing_miscellaneous_metadata_query() -> Select: query = select(URL).where( diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/__init__.py b/tests/automated/integration/api/annotate/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/happy_path/__init__.py rename to tests/automated/integration/api/annotate/__init__.py diff --git a/tests/automated/integration/api/annotate/agency/__init__.py b/tests/automated/integration/api/annotate/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py new file mode 100644 index 00000000..65b20b0c --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py @@ -0,0 +1,46 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestion and has not been annotated by the User + The user should receive all of the auto suggestions with full detail + """ + ath = api_test_helper 
+ buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that two agency_suggestions exist + assert len(next_annotation.agency_suggestions) == 2 + + for agency_suggestion in next_annotation.agency_suggestions: + assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION + assert agency_suggestion.pdap_agency_id is not None + assert agency_suggestion.agency_name is not None + assert agency_suggestion.state is not None + assert agency_suggestion.county is not None + assert agency_suggestion.locality is not None diff --git a/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py new file mode 100644 index 00000000..5bcb4569 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py @@ -0,0 +1,35 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestion and has not been annotated by the User + The user should receive all of the auto suggestions 
with full detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=False + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is not present + assert next_annotation.html_info.description == "" + assert next_annotation.html_info.title == "" diff --git a/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py b/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py new file mode 100644 index 00000000..a3ecae79 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py @@ -0,0 +1,44 @@ +import pytest + +from tests.automated.integration.api.conftest import MOCK_USER_ID +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_other_user_annotation(api_test_helper): + """ + Test Scenario: Other User Annotation + A URL has been annotated by another User + Our user should still receive this URL to annotate + """ + ath = api_test_helper + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 + ) + url_ids = setup_info.url_ids + + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == 
url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + # Test that another user can insert a suggestion + await ath.db_data_creator.manual_suggestion( + user_id=MOCK_USER_ID + 1, + url_id=url_ids[0], + ) + + # After this, test that our user does not receive this URL + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None diff --git a/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py b/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py new file mode 100644 index 00000000..e38421e1 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py @@ -0,0 +1,22 @@ +import pytest + +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_single_confirmed_agency(api_test_helper): + """ + Test Scenario: Single Confirmed Agency + A URL has a single Confirmed Agency and has not been annotated by the User + The user should not receive this URL to annotate + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.confirmed_suggestions( + url_ids=buci.url_ids, + ) + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None diff --git a/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py b/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py new file mode 100644 index 00000000..f911bba5 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py @@ -0,0 +1,45 @@ 
+import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): + """ + Test Scenario: Single Unknown Auto Suggestion + A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User + The user should receive a single Unknown Auto Suggestion lacking other detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + agency_suggestion = next_annotation.agency_suggestions[0] + + assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN + assert agency_suggestion.pdap_agency_id is None + assert agency_suggestion.agency_name is None + assert agency_suggestion.state is None + assert agency_suggestion.county is None + assert agency_suggestion.locality is None diff --git a/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py b/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py new file mode 100644 index 00000000..91049daa --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py @@ -0,0 +1,42 @@ +import pytest + +from 
src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_and_get_next(api_test_helper): + """ + Test Scenario: Submit and Get Next (no other URL available) + A URL has been annotated by our User, and all other valid URLs have been annotated + Our user should not receive another URL to annotate + Until another relevant URL is added + """ + ath = api_test_helper + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=2 + ) + url_ids = setup_info.url_ids + + # User should submit an annotation and receive the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + + ) + assert response.next_annotation is not None + + # User should submit this annotation and receive none for the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[1], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + ) + assert response.next_annotation is None diff --git a/tests/automated/integration/api/annotate/agency/test_submit_new.py b/tests/automated/integration/api/annotate/agency/test_submit_new.py new file mode 100644 index 00000000..e82c767f --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_submit_new.py @@ -0,0 +1,38 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from tests.helpers.setup.annotate_agency.core
import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_new(api_test_helper): + """ + Test Scenario: Submit New + Our user receives an annotation and marks it as `NEW` + This should complete successfully + And within the database the annotation should be marked as `NEW` + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 + ) + url_ids = setup_info.url_ids + + # User should submit an annotation and mark it as New + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=True + ) + ) + assert response.next_annotation is None + + # Within database, the annotation should be marked as `NEW` + all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_manual_suggestions) == 1 + assert all_manual_suggestions[0].is_new diff --git a/tests/automated/integration/api/annotate/all/__init__.py b/tests/automated/integration/api/annotate/all/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py new file mode 100644 index 00000000..5003f08f --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -0,0 +1,88 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from 
src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all(api_test_helper): + """ + Test the happy path workflow for the all-annotations endpoint + The user should be able to get a valid URL (filtering on batch id if needed), + submit a full annotation, and receive another URL + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_2 = setup_info_2.url_mapping + + # First, get a valid URL to annotate + get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() + + # Apply the second batch id as a filter and see that a different URL is returned + get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( + batch_id=setup_info_2.batch_id + ) + + assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id + + # Annotate the first and submit + agency_id = await ath.db_data_creator.agency() + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=False, + suggested_agency=agency_id + ) + ) + ) + assert post_response_1.next_annotation is not None + + # Confirm the second is received + assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id + + 
# Upon submitting the second, confirm that no more URLs are returned through either POST or GET + post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_2.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT, + ) + ) + assert post_response_2.next_annotation is None + + get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() + assert get_response_3.next_annotation is None + + + # Check that all annotations are present in the database + + # Should be two relevance annotations, one True and one False + all_relevance_suggestions: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + assert len(all_relevance_suggestions) == 2 + assert all_relevance_suggestions[0].suggested_status == SuggestedStatus.RELEVANT.value + assert all_relevance_suggestions[1].suggested_status == SuggestedStatus.NOT_RELEVANT.value + + # Should be one agency + all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_agency_suggestions) == 1 + assert all_agency_suggestions[0].is_new == False + assert all_agency_suggestions[0].agency_id == agency_id + + # Should be one record type + all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(all_record_type_suggestions) == 1 + assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value diff --git a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py new file mode 100644 index 00000000..a11c43a3 --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py @@ -0,0 +1,41 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import 
SuggestedStatus, RecordType +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all_post_batch_filtering(api_test_helper): + """ + Batch filtering should also work when posting annotations + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + setup_info_3 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_3 = setup_info_3.url_mapping + + # Submit the first annotation, using the third batch id, and receive the third URL + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + batch_id=setup_info_3.batch_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=True + ) + ) + ) + + assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id diff --git a/tests/automated/integration/api/annotate/all/test_validation_error.py b/tests/automated/integration/api/annotate/all/test_validation_error.py new file mode 100644 index 00000000..b805a435 --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_validation_error.py @@ -0,0 +1,27 @@ +import pytest + +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from src.core.exceptions import FailedValidationException +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio 
+async def test_annotate_all_validation_error(api_test_helper): + """ + Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response + """ + ath = api_test_helper + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + + with pytest.raises(FailedValidationException) as e: + response = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS + ) + ) diff --git a/tests/automated/integration/api/annotate/helpers.py b/tests/automated/integration/api/annotate/helpers.py new file mode 100644 index 00000000..39cfedab --- /dev/null +++ b/tests/automated/integration/api/annotate/helpers.py @@ -0,0 +1,22 @@ +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.db.dtos.url.mapping import URLMapping + + +def check_url_mappings_match( + map_1: URLMapping, + map_2: URLMapping +): + assert map_1.url_id == map_2.url_id + assert map_2.url == map_2.url + + +def check_html_info_not_empty( + html_info: ResponseHTMLInfo +): + assert not html_info_empty(html_info) + + +def html_info_empty( + html_info: ResponseHTMLInfo +) -> bool: + return html_info.description == "" and html_info.title == "" diff --git a/tests/automated/integration/api/annotate/record_type/__init__.py b/tests/automated/integration/api/annotate/record_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/record_type/test_record_type.py b/tests/automated/integration/api/annotate/record_type/test_record_type.py new file mode 100644 index 00000000..5e6d8917 --- /dev/null +++ b/tests/automated/integration/api/annotate/record_type/test_record_type.py @@ -0,0 +1,166 @@ +from http import 
HTTPStatus + +import pytest +from fastapi import HTTPException + +from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo +from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo +from src.core.enums import RecordType +from src.core.error_manager.enums import ErrorTypes +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from tests.automated.integration.api.annotate.helpers import check_url_mappings_match, check_html_info_not_empty, \ + html_info_empty +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_record_type(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct record type is returned + assert 
inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS + + # Annotate with value 'Personnel Records' and get next URL + request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.PERSONNEL_RECORDS + ) + ) + + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match(inner_info_2.url_info, url_2) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value + + # If user submits annotation for same URL, the URL should be overwritten + + request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.BOOKING_REPORTS + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(results) == 2 + + for 
result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert result.record_type == RecordType.BOOKING_REPORTS.value + + +@pytest.mark.asyncio +async def test_annotate_record_type_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_record_type_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Annotate with different user (default is 1) and get conflict error + try: + response = await ath.request_validator.post_record_type_annotation_and_get_next( + url_id=creation_info.url_ids[0], + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + except HTTPException as e: + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" + + +@pytest.mark.asyncio +async def test_annotate_record_type_no_html_info(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: 
GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) diff --git a/tests/automated/integration/api/annotate/relevancy/__init__.py b/tests/automated/integration/api/annotate/relevancy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/relevancy/test_relevancy.py b/tests/automated/integration/api/annotate/relevancy/test_relevancy.py new file mode 100644 index 00000000..387d68c0 --- /dev/null +++ b/tests/automated/integration/api/annotate/relevancy/test_relevancy.py @@ -0,0 +1,213 @@ +from http import HTTPStatus + +import pytest +from fastapi import HTTPException + +from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo +from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo +from src.core.enums import SuggestedStatus +from src.core.error_manager.enums import ErrorTypes +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.automated.integration.api.annotate.helpers import check_url_mappings_match, check_html_info_not_empty, \ + html_info_empty +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_relevancy(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 
'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct relevant value is returned + assert inner_info_1.annotation.is_relevant is True + + # A second user should see the same URL + + + # Annotate with value 'False' and get next URL + request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT + ) + ) + + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match( + inner_info_2.url_info, + url_2 + ) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. 
Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.suggested_status == SuggestedStatus.NOT_RELEVANT.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.suggested_status == SuggestedStatus.RELEVANT.value + + # If user submits annotation for same URL, the URL should be overwritten + request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + assert len(results) == 2 + + for result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert results[0].suggested_status == SuggestedStatus.RELEVANT.value + + +async def post_and_validate_relevancy_annotation(ath, url_id, annotation: SuggestedStatus): + response = ath.request_validator.post_relevance_annotation_and_get_next( + url_id=url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=annotation + ) + ) + + assert response.next_annotation is None + + results: list[UserRelevantSuggestion] = await ath.adb_client().get_all(UserRelevantSuggestion) + assert len(results) == 1 + assert results[0].suggested_status == annotation.value + + +@pytest.mark.asyncio +async def test_annotate_relevancy_broken_page(api_test_helper): + ath = api_test_helper + + creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) + + await post_and_validate_relevancy_annotation( + ath, + url_id=creation_info.url_ids[0], + 
annotation=SuggestedStatus.BROKEN_PAGE_404 + ) + + +@pytest.mark.asyncio +async def test_annotate_relevancy_individual_record(api_test_helper): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await post_and_validate_relevancy_annotation( + ath, + url_id=creation_info.url_ids[0], + annotation=SuggestedStatus.INDIVIDUAL_RECORD + ) + + +@pytest.mark.asyncio +async def test_annotate_relevancy_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_relevant_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + suggested_status=SuggestedStatus.RELEVANT + ) + + # Annotate with different user (default is 1) and get conflict error + try: + response = await ath.request_validator.post_relevance_annotation_and_get_next( + url_id=creation_info.url_ids[0], + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT + ) + ) + except HTTPException as e: + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" + + +@pytest.mark.asyncio +async def test_annotate_relevancy_no_html(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await 
ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) diff --git a/tests/automated/integration/api/annotate/test_.py b/tests/automated/integration/api/annotate/test_.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py deleted file mode 100644 index 51688765..00000000 --- a/tests/automated/integration/api/test_annotate.py +++ /dev/null @@ -1,756 +0,0 @@ -from http import HTTPStatus - -import pytest -from fastapi import HTTPException - -from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo -from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo -from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo -from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo -from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo -from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.core.error_manager.enums import ErrorTypes -from src.core.enums import RecordType, SuggestionType, SuggestedStatus -from src.core.exceptions import FailedValidationException -from 
src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion -from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo -from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review -from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency -from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo -from tests.automated.integration.api.conftest import MOCK_USER_ID - -def check_url_mappings_match( - map_1: URLMapping, - map_2: URLMapping -): - assert map_1.url_id == map_2.url_id - assert map_2.url == map_2.url - -def check_html_info_not_empty( - html_info: ResponseHTMLInfo -): - assert not html_info_empty(html_info) - -def html_info_empty( - html_info: ResponseHTMLInfo -) -> bool: - return html_info.description == "" and html_info.title == "" - -@pytest.mark.asyncio -async def test_annotate_relevancy(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, - relevant=True - ) - - # Add 'Relevancy' attribute with value `False` to 2nd URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_2.url_id, - relevant=False - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - # Call `GET` `/annotate/relevance` and receive next URL - request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, 
url_1) - check_html_info_not_empty(inner_info_1.html_info) - - # Validate that the correct relevant value is returned - assert inner_info_1.annotation.is_relevant is True - - # A second user should see the same URL - - - # Annotate with value 'False' and get next URL - request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT - ) - ) - - inner_info_2 = request_info_2.next_annotation - - check_url_mappings_match( - inner_info_2.url_info, - url_2 - ) - check_html_info_not_empty(inner_info_2.html_info) - - request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_2.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT - ) - ) - - assert request_info_3.next_annotation is None - - # Get all URL annotations. 
Confirm they exist for user - adb_client = ath.adb_client() - results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - result_1 = results[0] - result_2 = results[1] - - assert result_1.url_id == inner_info_1.url_info.url_id - assert result_1.suggested_status == SuggestedStatus.NOT_RELEVANT.value - - assert result_2.url_id == inner_info_2.url_info.url_id - assert result_2.suggested_status == SuggestedStatus.RELEVANT.value - - # If user submits annotation for same URL, the URL should be overwritten - request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT - ) - ) - - assert request_info_4.next_annotation is None - - results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - assert len(results) == 2 - - for result in results: - if result.url_id == inner_info_1.url_info.url_id: - assert results[0].suggested_status == SuggestedStatus.RELEVANT.value - -async def post_and_validate_relevancy_annotation(ath, url_id, annotation: SuggestedStatus): - response = ath.request_validator.post_relevance_annotation_and_get_next( - url_id=url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=annotation - ) - ) - - assert response.next_annotation is None - - results: list[UserRelevantSuggestion] = await ath.adb_client().get_all(UserRelevantSuggestion) - assert len(results) == 1 - assert results[0].suggested_status == annotation.value - -@pytest.mark.asyncio -async def test_annotate_relevancy_broken_page(api_test_helper): - ath = api_test_helper - - creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) - - await post_and_validate_relevancy_annotation( - ath, - url_id=creation_info.url_ids[0], - 
annotation=SuggestedStatus.BROKEN_PAGE_404 - ) - -@pytest.mark.asyncio -async def test_annotate_relevancy_individual_record(api_test_helper): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await post_and_validate_relevancy_annotation( - ath, - url_id=creation_info.url_ids[0], - annotation=SuggestedStatus.INDIVIDUAL_RECORD - ) - -@pytest.mark.asyncio -async def test_annotate_relevancy_already_annotated_by_different_user( - api_test_helper -): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await ath.db_data_creator.user_relevant_suggestion( - url_id=creation_info.url_ids[0], - user_id=2, - suggested_status=SuggestedStatus.RELEVANT - ) - - # Annotate with different user (default is 1) and get conflict error - try: - response = await ath.request_validator.post_relevance_annotation_and_get_next( - url_id=creation_info.url_ids[0], - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT - ) - ) - except HTTPException as e: - assert e.status_code == HTTPStatus.CONFLICT - assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value - assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" - - -@pytest.mark.asyncio -async def test_annotate_relevancy_no_html(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, - relevant=True - ) - - # Add 'Relevancy' attribute with value `False` to 2nd URL - await 
ath.db_data_creator.auto_relevant_suggestions( - url_id=url_2.url_id, - relevant=False - ) - - # Call `GET` `/annotate/relevance` and receive next URL - request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - assert html_info_empty(inner_info_1.html_info) - -@pytest.mark.asyncio -async def test_annotate_record_type(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add record type attribute with value `Accident Reports` to 1st URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_1.url_id, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_2.url_id, - record_type=RecordType.DISPATCH_RECORDINGS - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - - # Call `GET` `/annotate/record-type` and receive next URL - request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - check_html_info_not_empty(inner_info_1.html_info) - - # Validate that the correct record type is returned - assert inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS - - # Annotate with value 'Personnel Records' and get next URL - request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - 
url_id=inner_info_1.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.PERSONNEL_RECORDS - ) - ) - - inner_info_2 = request_info_2.next_annotation - - check_url_mappings_match(inner_info_2.url_info, url_2) - check_html_info_not_empty(inner_info_2.html_info) - - request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_2.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS - ) - ) - - assert request_info_3.next_annotation is None - - # Get all URL annotations. Confirm they exist for user - adb_client = ath.adb_client() - results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) - result_1 = results[0] - result_2 = results[1] - - assert result_1.url_id == inner_info_1.url_info.url_id - assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value - - assert result_2.url_id == inner_info_2.url_info.url_id - assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value - - # If user submits annotation for same URL, the URL should be overwritten - - request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.BOOKING_REPORTS - ) - ) - - assert request_info_4.next_annotation is None - - results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) - assert len(results) == 2 - - for result in results: - if result.url_id == inner_info_1.url_info.url_id: - assert result.record_type == RecordType.BOOKING_REPORTS.value - -@pytest.mark.asyncio -async def test_annotate_record_type_already_annotated_by_different_user( - api_test_helper -): - ath = 
api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await ath.db_data_creator.user_record_type_suggestion( - url_id=creation_info.url_ids[0], - user_id=2, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Annotate with different user (default is 1) and get conflict error - try: - response = await ath.request_validator.post_record_type_annotation_and_get_next( - url_id=creation_info.url_ids[0], - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS - ) - ) - except HTTPException as e: - assert e.status_code == HTTPStatus.CONFLICT - assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value - assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" - - -@pytest.mark.asyncio -async def test_annotate_record_type_no_html_info(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add record type attribute with value `Accident Reports` to 1st URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_1.url_id, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_2.url_id, - record_type=RecordType.DISPATCH_RECORDINGS - ) - - # Call `GET` `/annotate/record-type` and receive next URL - request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - assert html_info_empty(inner_info_1.html_info) - -@pytest.mark.asyncio 
-async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): - """ - Test Scenario: Multiple Auto Suggestions - A URL has multiple Agency Auto Suggestion and has not been annotated by the User - The user should receive all of the auto suggestions with full detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=2, - suggestion_type=SuggestionType.AUTO_SUGGESTION - ) - - # User requests next annotation - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that two agency_suggestions exist - assert len(next_annotation.agency_suggestions) == 2 - - for agency_suggestion in next_annotation.agency_suggestions: - assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION - assert agency_suggestion.pdap_agency_id is not None - assert agency_suggestion.agency_name is not None - assert agency_suggestion.state is not None - assert agency_suggestion.county is not None - assert agency_suggestion.locality is not None - - -@pytest.mark.asyncio -async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): - """ - Test Scenario: Multiple Auto Suggestions - A URL has multiple Agency Auto Suggestion and has not been annotated by the User - The user should receive all of the auto suggestions with full detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=False - ) - await ath.db_data_creator.auto_suggestions( - 
url_ids=buci.url_ids, - num_suggestions=2, - suggestion_type=SuggestionType.AUTO_SUGGESTION - ) - - # User requests next annotation - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is not present - assert next_annotation.html_info.description == "" - assert next_annotation.html_info.title == "" - -@pytest.mark.asyncio -async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): - """ - Test Scenario: Single Unknown Auto Suggestion - A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User - The user should receive a single Unknown Auto Suggestion lacking other detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=1, - suggestion_type=SuggestionType.UNKNOWN - ) - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that one agency_suggestion exists - assert len(next_annotation.agency_suggestions) == 1 - - agency_suggestion = next_annotation.agency_suggestions[0] - - assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN - assert agency_suggestion.pdap_agency_id is None - assert agency_suggestion.agency_name is None - assert agency_suggestion.state is None - assert agency_suggestion.county is None - assert agency_suggestion.locality is 
None - - -@pytest.mark.asyncio -async def test_annotate_agency_single_confirmed_agency(api_test_helper): - """ - Test Scenario: Single Confirmed Agency - A URL has a single Confirmed Agency and has not been annotated by the User - The user should not receive this URL to annotate - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.confirmed_suggestions( - url_ids=buci.url_ids, - ) - response = await ath.request_validator.get_next_agency_annotation() - assert response.next_annotation is None - -@pytest.mark.asyncio -async def test_annotate_agency_other_user_annotation(api_test_helper): - """ - Test Scenario: Other User Annotation - A URL has been annotated by another User - Our user should still receive this URL to annotate - """ - ath = api_test_helper - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=1 - ) - url_ids = setup_info.url_ids - - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that one agency_suggestion exists - assert len(next_annotation.agency_suggestions) == 1 - - # Test that another user can insert a suggestion - await ath.db_data_creator.manual_suggestion( - user_id=MOCK_USER_ID + 1, - url_id=url_ids[0], - ) - - # After this, text that our user does not receive this URL - response = await ath.request_validator.get_next_agency_annotation() - assert response.next_annotation is None - -@pytest.mark.asyncio -async def test_annotate_agency_submit_and_get_next(api_test_helper): - """ - Test Scenario: Submit and 
Get Next (no other URL available) - A URL has been annotated by our User, and no other valid URLs have not been annotated - Our user should not receive another URL to annotate - Until another relevant URL is added - """ - ath = api_test_helper - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=2 - ) - url_ids = setup_info.url_ids - - # User should submit an annotation and receive the next - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[0], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=False - ) - - ) - assert response.next_annotation is not None - - # User should submit this annotation and receive none for the next - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[1], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=False - ) - ) - assert response.next_annotation is None - - -@pytest.mark.asyncio -async def test_annotate_agency_submit_new(api_test_helper): - """ - Test Scenario: Submit New - Our user receives an annotation and marks it as `NEW` - This should complete successfully - And within the database the annotation should be marked as `NEW` - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=1 - ) - url_ids = setup_info.url_ids - - # User should submit an annotation and mark it as New - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[0], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=True - ) - ) - assert response.next_annotation is None - - # Within database, the annotation should be marked as `NEW` 
- all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) - assert len(all_manual_suggestions) == 1 - assert all_manual_suggestions[0].is_new - -@pytest.mark.asyncio -async def test_annotate_all(api_test_helper): - """ - Test the happy path workflow for the all-annotations endpoint - The user should be able to get a valid URL (filtering on batch id if needed), - submit a full annotation, and receive another URL - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - setup_info_2 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_2 = setup_info_2.url_mapping - - # First, get a valid URL to annotate - get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() - - # Apply the second batch id as a filter and see that a different URL is returned - get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( - batch_id=setup_info_2.batch_id - ) - - assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id - - # Annotate the first and submit - agency_id = await ath.db_data_creator.agency() - post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS, - agency=URLAgencyAnnotationPostInfo( - is_new=False, - suggested_agency=agency_id - ) - ) - ) - assert post_response_1.next_annotation is not None - - # Confirm the second is received - assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id - - # Upon submitting the second, confirm that no more URLs are returned through either POST or GET - 
post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_2.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT, - ) - ) - assert post_response_2.next_annotation is None - - get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() - assert get_response_3.next_annotation is None - - - # Check that all annotations are present in the database - - # Should be two relevance annotations, one True and one False - all_relevance_suggestions: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - assert len(all_relevance_suggestions) == 2 - assert all_relevance_suggestions[0].suggested_status == SuggestedStatus.RELEVANT.value - assert all_relevance_suggestions[1].suggested_status == SuggestedStatus.NOT_RELEVANT.value - - # Should be one agency - all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) - assert len(all_agency_suggestions) == 1 - assert all_agency_suggestions[0].is_new == False - assert all_agency_suggestions[0].agency_id == agency_id - - # Should be one record type - all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) - assert len(all_record_type_suggestions) == 1 - assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value - -@pytest.mark.asyncio -async def test_annotate_all_post_batch_filtering(api_test_helper): - """ - Batch filtering should also work when posting annotations - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - setup_info_2 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - setup_info_3 = await setup_for_get_next_url_for_final_review( - 
db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_3 = setup_info_3.url_mapping - - # Submit the first annotation, using the third batch id, and receive the third URL - post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - batch_id=setup_info_3.batch_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS, - agency=URLAgencyAnnotationPostInfo( - is_new=True - ) - ) - ) - - assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id - - -@pytest.mark.asyncio -async def test_annotate_all_validation_error(api_test_helper): - """ - Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response - """ - ath = api_test_helper - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - - with pytest.raises(FailedValidationException) as e: - response = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS - ) - ) diff --git a/tests/automated/integration/core/async_/run_task/test_prereq_met.py b/tests/automated/integration/core/async_/run_task/test_prereq_met.py index cda6a6d6..e5425fd9 100644 --- a/tests/automated/integration/core/async_/run_task/test_prereq_met.py +++ b/tests/automated/integration/core/async_/run_task/test_prereq_met.py @@ -18,7 +18,6 @@ async def test_run_task_prereq_met(db_data_creator: DBDataCreator): """ When a task pre-requisite is met, the task should be run - And a task entry should be created in the database """ async def run_task(self) -> TaskOperatorRunInfo: @@ -48,9 +47,4 @@ async def run_task(self) -> TaskOperatorRunInfo: # There 
should be two calls to meets_task_prerequisites mock_operator.meets_task_prerequisites.assert_has_calls([call(), call()]) - results = await db_data_creator.adb_client.get_all(Task) - - assert len(results) == 1 - assert results[0].task_status == BatchStatus.IN_PROCESS.value - core.task_manager.conclude_task.assert_called_once() diff --git a/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py b/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py deleted file mode 100644 index 72430fec..00000000 --- a/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.core.enums import SuggestedStatus, RecordType, SuggestionType -from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_get_next_url_for_final_review_new_agency(db_data_creator: DBDataCreator): - """ - Test that a URL with a new agency is properly returned - """ - - # Apply batch v2 - parameters = TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.RELEVANT, - user_agency=URLAgencyAnnotationPostInfo( - is_new=True - ), - user_record_type=RecordType.ARREST_RECORDS - ) - ) - ] - ) - creation_info = await db_data_creator.batch_v2(parameters) - outer_result = await db_data_creator.adb_client.get_next_url_for_final_review( - batch_id=None - ) - result = outer_result.next_source - - assert result is not None - user_suggestion = result.annotations.agency.user - assert 
user_suggestion.suggestion_type == SuggestionType.NEW_AGENCY - assert user_suggestion.pdap_agency_id is None - assert user_suggestion.agency_name is None diff --git a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py index 7ddc11fb..ab5acd59 100644 --- a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py +++ b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py @@ -14,7 +14,7 @@ async def test_get_next_url_for_user_relevance_annotation_validated( A validated URL should not turn up in get_next_url_for_user_annotation """ dbdc = db_data_creator - url_1: int = (await dbdc.create_validated_urls())[0] + url_1: int = (await dbdc.create_validated_urls())[0].url_id # Add `Relevancy` attribute with value `True` await db_data_creator.auto_relevant_suggestions( diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py deleted file mode 100644 index 50748b7a..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py +++ /dev/null @@ -1,19 +0,0 @@ -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency - - -async def assert_expected_confirmed_and_auto_suggestions(adb_client: AsyncDatabaseClient): - confirmed_suggestions = await adb_client.get_urls_with_confirmed_agencies() - - # The number of confirmed suggestions is dependent on how often - # the subtask iterated through the sample agency suggestions defined in `data.py` - assert len(confirmed_suggestions) == 3, f"Expected 3 confirmed suggestions, got {len(confirmed_suggestions)}" - agencies = await adb_client.get_all(Agency) - assert len(agencies) == 2 - raise 
NotImplementedError("Revise") - # auto_suggestions = await adb_client.get_all(AutomatedUrlAgencySuggestion) - assert len(auto_suggestions) == 4, f"Expected 4 auto suggestions, got {len(auto_suggestions)}" - # Of the auto suggestions, 2 should be unknown - assert len([s for s in auto_suggestions if s.is_unknown]) == 2 - # Of the auto suggestions, 2 should not be unknown - assert len([s for s in auto_suggestions if not s.is_unknown]) == 2 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py deleted file mode 100644 index ea224c37..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py +++ /dev/null @@ -1,34 +0,0 @@ - - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo - -SAMPLE_AGENCY_SUGGESTIONS = [ - URLAgencySuggestionInfo( - url_id=-1, # This will be overwritten - suggestion_type=SuggestionType.UNKNOWN, - pdap_agency_id=None, - agency_name=None, - state=None, - county=None, - locality=None - ), - URLAgencySuggestionInfo( - url_id=-1, # This will be overwritten - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=-1, - agency_name="Test Agency", - state="Test State", - county="Test County", - locality="Test Locality" - ), - URLAgencySuggestionInfo( - url_id=-1, # This will be overwritten - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=-1, - agency_name="Test Agency 2", - state="Test State 2", - county="Test County 2", - locality="Test Locality 2" - ) -] diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py deleted file mode 100644 index a4dcb227..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py 
+++ /dev/null @@ -1,19 +0,0 @@ -from copy import deepcopy -from typing import Optional - -from src.core.enums import SuggestionType -from tests.automated.integration.tasks.url.impl.agency_identification.happy_path.data import SAMPLE_AGENCY_SUGGESTIONS - - -async def mock_run_subtask( - subtask, - url_id: int, - collector_metadata: Optional[dict] -): - """A mocked version of run_subtask that returns a single suggestion for each url_id.""" - - # Deepcopy to prevent using the same instance in memory - suggestion = deepcopy(SAMPLE_AGENCY_SUGGESTIONS[url_id % 3]) - suggestion.url_id = url_id - suggestion.pdap_agency_id = (url_id % 3) if suggestion.suggestion_type != SuggestionType.UNKNOWN else None - return [suggestion] diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py index 6fd524a8..5f927159 100644 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py @@ -1,10 +1,42 @@ import pytest +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.external.pdap.client import PDAPClient +from tests.helpers.asserts import assert_task_run_success + @pytest.mark.asyncio -async def test_validated_meta_url_not_included(): +async def test_validated_meta_url_not_included( + db_data_creator, + mock_pdap_client: PDAPClient, + monkeypatch +): """ If a validated Meta URL is included in the database This should not be included in the submit approved task """ - raise NotImplementedError \ No newline at end of file + + # Get Task Operator + operator = 
SubmitApprovedURLTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=mock_pdap_client + ) + + dbdc = db_data_creator + url_1: int = (await dbdc.create_validated_urls( + validation_type=URLValidatedType.META_URL + ))[0].url_id + + # Test task operator does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + # Confirm entry not included in database + ds_urls: list[URLDataSource] = await dbdc.adb_client.get_all(URLDataSource) + assert len(ds_urls) == 0 diff --git a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py index 50df6aef..e55ad9ad 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py @@ -20,6 +20,7 @@ @pytest.mark.asyncio async def test_url_404_probe_task( + wiped_database, db_data_creator: DBDataCreator ): diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py index a07aabc2..fe54c6f9 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py @@ -4,6 +4,10 @@ from src.core.enums import SuggestionType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.db.enums import TaskType +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic from tests.helpers.data_creator.commands.base import 
DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand @@ -14,44 +18,61 @@ def __init__( self, url_id: int, count: int, - suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION + suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION, + subtask_type: AutoAgencyIDSubtaskType = AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + confidence: int = 50 ): super().__init__() if suggestion_type == SuggestionType.UNKNOWN: count = 1 # Can only be one auto suggestion if unknown + agencies_found = False + else: + agencies_found = True self.url_id = url_id self.count = count self.suggestion_type = suggestion_type + self.subtask_type = subtask_type + self.confidence = confidence + self.agencies_found = agencies_found @override async def run(self) -> None: task_id: int = await self.add_task() - suggestions = [] + subtask_id: int = await self.create_subtask(task_id) + if not self.agencies_found: + return + + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = [] for _ in range(self.count): - if self.suggestion_type == SuggestionType.UNKNOWN: - pdap_agency_id = None - else: - pdap_agency_id = await self.run_command(AgencyCommand()) - suggestion = URLAgencySuggestionInfo( - url_id=self.url_id, - suggestion_type=self.suggestion_type, - pdap_agency_id=pdap_agency_id, - state="Test State", - county="Test County", - locality="Test Locality" + pdap_agency_id: int = await self.run_command(AgencyCommand()) + + suggestion = AgencyIDSubtaskSuggestionPydantic( + subtask_id=subtask_id, + agency_id=pdap_agency_id, + confidence=self.confidence, ) suggestions.append(suggestion) - await self.adb_client.add_agency_auto_suggestions( - suggestions=suggestions + await self.adb_client.bulk_insert( + models=suggestions, ) async def add_task(self) -> int: - raise NotImplementedError + task_id: int = await self.adb_client.initiate_task( + task_type=TaskType.AGENCY_IDENTIFICATION, + ) + return task_id async def create_subtask(self, task_id: int) -> 
int: - raise NotImplementedError - - async def add_suggestions(self) -> None: - raise NotImplementedError + obj: URLAutoAgencyIDSubtaskPydantic = URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + type=self.subtask_type, + url_id=self.url_id, + agencies_found=self.agencies_found, + ) + subtask_id: int = (await self.adb_client.bulk_insert( + models=[obj], + return_ids=True + ))[0] + return subtask_id diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 4b8b4751..6c597f3f 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -203,23 +203,14 @@ async def auto_suggestions( raise ValueError(f"suggestion_type must be one of {allowed_suggestion_types}") if suggestion_type == SuggestionType.UNKNOWN and num_suggestions > 1: raise ValueError("num_suggestions must be 1 when suggestion_type is unknown") - + for url_id in url_ids: - suggestions = [] - for i in range(num_suggestions): - if suggestion_type == SuggestionType.UNKNOWN: - agency_id = None - else: - agency_id = await self.agency() - suggestion = URLAgencySuggestionInfo( + await self.run_command( + AgencyAutoSuggestionsCommand( url_id=url_id, - suggestion_type=suggestion_type, - pdap_agency_id=agency_id + count=num_suggestions, + suggestion_type=suggestion_type ) - suggestions.append(suggestion) - - await self.adb_client.add_agency_auto_suggestions( - suggestions=suggestions ) async def confirmed_suggestions(self, url_ids: list[int]): diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py index 6c4a3498..58b1ae49 100644 --- a/tests/helpers/setup/final_review/core.py +++ b/tests/helpers/setup/final_review/core.py @@ -37,7 +37,7 @@ async def add_agency_suggestion() -> int: ) return agency_id - async def add_record_type_suggestion(record_type: RecordType): + async def add_record_type_suggestion(record_type: RecordType) -> None: await db_data_creator.user_record_type_suggestion( url_id=url_mapping.url_id, 
record_type=record_type From e36bf180cf1e71246baae5b51c945fac4c5dcf02 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 6 Sep 2025 09:11:11 -0400 Subject: [PATCH 21/33] Continue draft --- .../impl/homepage_match_/{query.py => queries/__init__.py} | 0 .../subtasks/impl/homepage_match_/queries/get.py | 0 .../subtasks/impl/homepage_match_/queries/insert.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/{query.py => queries/__init__.py} (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/query.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py new file mode 100644 index 00000000..e69de29b From 2ac254e5f8ecffff5c0f9b78828ee79b5c4d0e93 Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 6 Sep 2025 10:43:04 -0400 Subject: [PATCH 22/33] Begin setting up Homepage CTE and additional views --- ...aee0dd79_overhaul_agency_identification.py | 26 +++++++++-- 
.../homepage_match_/queries/ctes/__init__.py | 0 .../queries/ctes/meta_urls_with_root.py | 23 ++++++++++ .../ctes/meta_urls_with_root_agencies.py | 20 ++++++++ .../ctes/unvalidated_urls_with_root.py | 21 +++++++++ .../queries/ctes/whitelisted_root_urls.py | 46 +++++++++++++++++++ src/db/models/views/unvalidated_url.py | 27 +++++++++++ 7 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py create mode 100644 src/db/models/views/unvalidated_url.py diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py index e7d9b6fd..428aff9b 100644 --- a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -26,6 +26,7 @@ LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "agency_id_subtask_suggestions" META_URL_VIEW_NAME: str = "meta_url_view" +UNVALIDATED_URL_VIEW_NAME: str = "unvalidated_url_view" URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME: str = "url_auto_agency_suggestions" @@ -55,9 +56,7 @@ def upgrade() -> None: _drop_url_annotation_flags_view() _create_new_url_annotation_flags_view() _drop_url_auto_agency_suggestions_table() - -def _drop_url_annotation_flags_view(): - 
op.execute(f"DROP VIEW IF EXISTS url_annotation_flags") + _create_unvalidated_urls_view() def downgrade() -> None: @@ -69,6 +68,27 @@ def downgrade() -> None: _drop_url_auto_agency_subtask_table() _drop_meta_url_view() SUBTASK_DETAIL_CODE_ENUM.drop(op.get_bind()) + _drop_unvalidated_urls_view() + +def _create_unvalidated_urls_view(): + op.execute(f""" + CREATE OR REPLACE VIEW {UNVALIDATED_URL_VIEW_NAME} as + select + u.id as url_id + from + urls u + left join flag_url_validated fuv + on fuv.url_id = u.id + where + fuv.type is null + """) + +def _drop_unvalidated_urls_view(): + op.execute(f"DROP VIEW IF EXISTS {UNVALIDATED_URL_VIEW_NAME}") + + +def _drop_url_annotation_flags_view(): + op.execute(f"DROP VIEW IF EXISTS url_annotation_flags") def _drop_meta_url_view(): diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py new file mode 100644 index 00000000..63b6b417 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py @@ -0,0 +1,23 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.whitelisted_root_urls import \ + WHITELISTED_ROOT_URLS_CTE +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.views.meta_url import MetaURL + +META_ROOT_URLS_CTE: CTE = ( + select( + MetaURL.url_id.label("meta_url_id"), + LinkURLRootURL.root_url_id + ) + .join( + LinkURLRootURL, + MetaURL.url_id == 
LinkURLRootURL.url_id + ) + # Must be a Whitelisted Root URL + .join( + WHITELISTED_ROOT_URLS_CTE, + WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id + ) + .cte("meta_root_urls") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py new file mode 100644 index 00000000..bd388f8f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py @@ -0,0 +1,20 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root import \ + META_ROOT_URLS_CTE +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +META_ROOT_URLS_WITH_AGENCIES: CTE = ( + select( + META_ROOT_URLS_CTE.c.url_id, + META_ROOT_URLS_CTE.c.root_url_id, + LinkURLAgency.agency_id + ) + .join( + LinkURLAgency, + META_ROOT_URLS_CTE.c.meta_url_id == LinkURLAgency.url_id + ) + .cte( + "meta_root_urls_with_agencies" + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py new file mode 100644 index 00000000..bdfaa046 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py @@ -0,0 +1,21 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.whitelisted_root_urls import \ + WHITELISTED_ROOT_URLS_CTE +from src.db.models.impl.link.urls_root_url.sqlalchemy 
import LinkURLRootURL +from src.db.models.views.unvalidated_url import UnvalidatedURL + +UNVALIDATED_URLS_WITH_ROOT: CTE = ( + select( + UnvalidatedURL.url_id, + LinkURLRootURL.root_url_id + ) + .join( + LinkURLRootURL, + UnvalidatedURL.url_id == LinkURLRootURL.url_id + ) + .join( + WHITELISTED_ROOT_URLS_CTE, + WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py new file mode 100644 index 00000000..66f7c777 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -0,0 +1,46 @@ +from sqlalchemy import CTE, select, func + +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.url.core.sqlalchemy import URL + +WHITELISTED_ROOT_URLS_CTE: CTE = ( + # TODO: Check for no fan-out + select( + URL.id + ) + .join( + FlagRootURL, + URL.id == FlagRootURL.url_id + ) + # Must be linked to other URLs + .join( + LinkURLRootURL, + URL.id == LinkURLRootURL.root_url_id + ) + # Those URLs must be meta URLS + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkURLRootURL.url_id + ) + # Get the Agency URLs for those URLs + .join( + LinkURLAgency, + LinkURLAgency.url_id == LinkURLRootURL.url_id + ) + .where( + # The connected URLs must be Meta URLs + FlagURLValidated.type == URLValidatedType.META_URL + ) + .group_by( + URL.id + ) + # Must have no more than two agencies 
connected + .having( + func.count(LinkURLAgency.agency_id) <= 2 + ) + .cte("whitelisted_root_urls") +) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py new file mode 100644 index 00000000..767ee960 --- /dev/null +++ b/src/db/models/views/unvalidated_url.py @@ -0,0 +1,27 @@ +""" +select + u.id as url_id +from + urls u + left join flag_url_validated fuv + on fuv.url_id = u.id +where + fuv.type is null +""" +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class UnvalidatedURL( + Base, + ViewMixin, + URLDependentMixin, +): + + __tablename__ = "unvalidated_url_view" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) \ No newline at end of file From fd16c86c4c4d85c02e40bdf0c36fd2c9f9bc99be Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 6 Sep 2025 14:03:08 -0400 Subject: [PATCH 23/33] Continue Draft --- ...aee0dd79_overhaul_agency_identification.py | 5 ++-- .../queries/ctes/consolidated.py | 27 +++++++++++++++++++ .../queries/ctes/count_agency_per_url.py | 20 ++++++++++++++ .../queries/ctes/multi_agency_case.py | 18 +++++++++++++ .../queries/ctes/single_agency_case.py | 18 +++++++++++++ .../impl/homepage_match_/queries/get.py | 21 +++++++++++++++ .../impl/homepage_match_/queries/insert.py | 4 +++ .../url/suggestion/agency/subtask/enum.py | 3 +-- 8 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py create mode 100644 
src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py index 428aff9b..39703fde 100644 --- a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -38,9 +38,8 @@ SUBTASK_DETAIL_CODE_ENUM = sa.Enum( 'no details', 'retrieval error', - 'case-homepage-single agency', - 'case-homepage-no data sources', - 'case-homepage-multi agency nonzero data sources', + 'homepage-single agency', + 'homepage-multi agency', name="agency_id_subtask_detail_code", ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py new file mode 100644 index 00000000..993d109a --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py @@ -0,0 +1,27 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.count_agency_per_url import \ + COUNT_AGENCY_PER_URL_CTE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.unvalidated_urls_with_root import \ + UNVALIDATED_URLS_WITH_ROOT +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +CONSOLIDATED_CTE: CTE = ( + select( + UNVALIDATED_URLS_WITH_ROOT.c.url_id, + LinkURLAgency.agency_id, + COUNT_AGENCY_PER_URL_CTE.c.agency_count, + ) + .join( + COUNT_AGENCY_PER_URL_CTE, + COUNT_AGENCY_PER_URL_CTE.c.url_id == UNVALIDATED_URLS_WITH_ROOT.c.url_id + ) + .join( + LinkURLAgency, + LinkURLAgency.url_id == 
UNVALIDATED_URLS_WITH_ROOT.c.url_id + ) + .where( + COUNT_AGENCY_PER_URL_CTE.c.agency_count >= 1 + ) + .cte("consolidated") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py new file mode 100644 index 00000000..8607131c --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py @@ -0,0 +1,20 @@ +from sqlalchemy import CTE, func, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root import \ + META_ROOT_URLS_CTE +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +COUNT_AGENCY_PER_URL_CTE: CTE = ( + select( + META_ROOT_URLS_CTE.c.url_id, + func.count(LinkURLAgency.agency_id).label("agency_count") + ) + .join( + LinkURLAgency, + META_ROOT_URLS_CTE.c.meta_url_id == LinkURLAgency.url_id + ) + .group_by( + META_ROOT_URLS_CTE.c.url_id + ) + .cte("count_agency_per_url") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py new file mode 100644 index 00000000..b2c89748 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py @@ -0,0 +1,18 @@ +from sqlalchemy import CTE, select, literal + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + +MULTI_AGENCY_CASE_CTE: CTE = ( + select( + 
CONSOLIDATED_CTE.c.url_id, + CONSOLIDATED_CTE.c.agency_id, + literal(100 / CONSOLIDATED_CTE.c.agency_count).label("confidence"), + literal(SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY.value).label("detail_code") + ) + .where( + CONSOLIDATED_CTE.c.agency_count > 1 + ) + .cte("multi_agency_case") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py new file mode 100644 index 00000000..05734184 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py @@ -0,0 +1,18 @@ +from sqlalchemy import select, CTE, literal + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + +SINGLE_AGENCY_CASE_CTE: CTE = ( + select( + CONSOLIDATED_CTE.c.url_id, + CONSOLIDATED_CTE.c.agency_id, + literal(95).label("confidence"), + literal(SubtaskDetailCode.HOMEPAGE_SINGLE_AGENCY.value).label("detail_code") + ) + .where( + CONSOLIDATED_CTE.c.agency_count == 1 + ) + .cte("single_agency_case") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py index e69de29b..645a5200 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py @@ -0,0 +1,21 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + + +class GetHomepageMatchSubtaskURLsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + query = ( + select( + CONSOLIDATED_CTE.c.url_id, + ).distinct() + ) + + result: list[int] = await sh.scalars(session, query=query) + return result + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py index e69de29b..a33560ee 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py @@ -0,0 +1,4 @@ +from src.db.queries.base.builder import QueryBuilderBase + + +class InsertHomepageMatchSubtaskEntriesQueryBuilder(QueryBuilderBase): \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/enum.py b/src/db/models/impl/url/suggestion/agency/subtask/enum.py index 33730954..f3ee7c3f 100644 --- a/src/db/models/impl/url/suggestion/agency/subtask/enum.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/enum.py @@ -11,5 +11,4 @@ class SubtaskDetailCode(Enum): NO_DETAILS = "no details" RETRIEVAL_ERROR = "retrieval error" HOMEPAGE_SINGLE_AGENCY = "homepage-single agency" - HOMEPAGE_NO_DATA_SOURCES = "homepage-no data sources" - HOMEPAGE_MULTI_AGENCY_NONZERO_DATA_SOURCES = "homepage-multi agency nonzero data sources" \ No newline at end of file + HOMEPAGE_MULTI_AGENCY = "homepage-multi agency" \ No newline at end of file From cd4831569b66d47398873c8e41eee2ef8b77c55e Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 6 Sep 2025 14:03:48 -0400 Subject: 
[PATCH 24/33] Continue Draft --- .../subtasks/impl/homepage_match_/queries/insert.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py index a33560ee..18e95f20 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py @@ -1,4 +1,9 @@ from src.db.queries.base.builder import QueryBuilderBase -class InsertHomepageMatchSubtaskEntriesQueryBuilder(QueryBuilderBase): \ No newline at end of file +class InsertHomepageMatchSubtaskEntriesQueryBuilder(QueryBuilderBase): + # TODO: Write Insert for Subtasks + + # TODO: Write insert for Subtask entries + + # TODO: Do URL link \ No newline at end of file From d07dfe506e3d6f16fd3e608c0ff6ad298f17763c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 7 Sep 2025 09:55:01 -0400 Subject: [PATCH 25/33] Finish auto tests for homepage match --- .../subtasks/impl/homepage_match_/convert.py | 47 ++++++ .../subtasks/impl/homepage_match_/core.py | 60 ++++++- .../impl/homepage_match_/models/__init__.py | 0 .../impl/homepage_match_/models/entry.py | 10 ++ .../impl/homepage_match_/models/mapping.py | 6 + .../queries/ctes/consolidated.py | 11 +- .../queries/ctes/count_agency_per_url.py | 4 +- .../ctes/meta_urls_with_root_agencies.py | 2 +- .../queries/ctes/multi_agency_case.py | 5 +- .../queries/ctes/single_agency_case.py | 3 +- .../ctes/unvalidated_urls_with_root.py | 1 + .../impl/homepage_match_/queries/get.py | 38 +++-- .../impl/homepage_match_/queries/insert.py | 9 - .../queries/ctes/subtask/impl/homepage.py | 75 +-------- .../ineligible_cases/__init__.py | 0 .../ineligible_cases/test_blacklist.py | 51 ++++++ .../test_no_validated_meta_urls.py | 29 ++++ 
.../ineligible_cases/test_root_urls.py | 22 +++ .../subtasks/homepage_match/test_core.py | 6 - .../homepage_match/test_happy_path.py | 159 ++++++++++++++++++ tests/helpers/data_creator/core.py | 42 +++++ 21 files changed, 468 insertions(+), 112 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py new file mode 100644 index 00000000..f4ba913e --- /dev/null +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py @@ -0,0 +1,47 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \ + SubtaskURLMapping +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode, AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic + + +def convert_params_to_subtask_entries( + params: list[GetHomepageMatchParams], + task_id: int +) -> list[URLAutoAgencyIDSubtaskPydantic]: + url_id_to_detail_code: dict[int, SubtaskDetailCode] = {} + for param in params: + url_id_to_detail_code[param.url_id] = param.detail_code + + results: list[URLAutoAgencyIDSubtaskPydantic] = [] + for url_id, detail_code in url_id_to_detail_code.items(): + result = URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + agencies_found=True, + detail=detail_code, + ) + results.append(result) + return results + +def convert_subtask_mappings_and_params_to_suggestions( + mappings: list[SubtaskURLMapping], + params: list[GetHomepageMatchParams] +) -> list[AgencyIDSubtaskSuggestionPydantic]: + url_id_to_subtask_id: dict[int, int] = { + mapping.url_id: mapping.subtask_id + for mapping in mappings + } + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = [] + for param in params: + subtask_id = url_id_to_subtask_id.get(param.url_id) + suggestion = AgencyIDSubtaskSuggestionPydantic( + subtask_id=subtask_id, + agency_id=param.agency_id, + confidence=param.confidence, + ) + suggestions.append(suggestion) + return suggestions \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py index 745223d6..f335cb3a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py @@ -1,7 +1,63 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.convert import \ + convert_params_to_subtask_entries, convert_subtask_mappings_and_params_to_suggestions +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \ + SubtaskURLMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.get import \ + GetHomepageMatchSubtaskURLsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic -class HomepageMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): +class HomepageMatchSubtaskOperator( + AgencyIDSubtaskOperatorBase, +): async def inner_logic(self) -> None: - raise NotImplementedError() \ No newline at end of file + # Get Params + params: list[GetHomepageMatchParams] = \ + await self.adb_client.run_query_builder( + GetHomepageMatchSubtaskURLsQueryBuilder() + ) + + # Insert Subtask Entries + subtask_entries: list[URLAutoAgencyIDSubtaskPydantic] = convert_params_to_subtask_entries( + params=params, + task_id=self.task_id + ) + subtask_mappings: list[SubtaskURLMapping] = await self.insert_subtask_entries( + 
entries=subtask_entries + ) + + # Link URLs + url_ids: list[int] = [mapping.url_id for mapping in subtask_mappings] + self.linked_urls = url_ids + + # Insert Entries + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = convert_subtask_mappings_and_params_to_suggestions( + mappings=subtask_mappings, + params=params + ) + await self.adb_client.bulk_insert( + models=suggestions, + ) + + + async def insert_subtask_entries( + self, + entries: list[URLAutoAgencyIDSubtaskPydantic] + ) -> list[SubtaskURLMapping]: + subtask_ids: list[int] = await self.adb_client.bulk_insert( + models=entries, + return_ids=True + ) + mappings: list[SubtaskURLMapping] = [] + for subtask_id, entry in zip(subtask_ids, entries): + mapping = SubtaskURLMapping( + url_id=entry.url_id, + subtask_id=subtask_id, + ) + mappings.append(mapping) + return mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py new file mode 100644 index 00000000..6c65f9ad --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + + +class GetHomepageMatchParams(BaseModel): + url_id: int + agency_id: int + confidence: int = Field(..., ge=0, le=100) + detail_code: SubtaskDetailCode \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py new file mode 100644 index 00000000..2e4d2fbb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class SubtaskURLMapping(BaseModel): + url_id: int + subtask_id: int \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py index 993d109a..d90dfed6 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py @@ -2,23 +2,24 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.count_agency_per_url import \ COUNT_AGENCY_PER_URL_CTE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root_agencies import \ + META_ROOT_URLS_WITH_AGENCIES from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.unvalidated_urls_with_root import \ UNVALIDATED_URLS_WITH_ROOT -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency CONSOLIDATED_CTE: CTE = ( select( UNVALIDATED_URLS_WITH_ROOT.c.url_id, - LinkURLAgency.agency_id, + META_ROOT_URLS_WITH_AGENCIES.c.agency_id, COUNT_AGENCY_PER_URL_CTE.c.agency_count, ) .join( COUNT_AGENCY_PER_URL_CTE, - COUNT_AGENCY_PER_URL_CTE.c.url_id == UNVALIDATED_URLS_WITH_ROOT.c.url_id + COUNT_AGENCY_PER_URL_CTE.c.root_url_id == UNVALIDATED_URLS_WITH_ROOT.c.root_url_id ) .join( - LinkURLAgency, - LinkURLAgency.url_id == UNVALIDATED_URLS_WITH_ROOT.c.url_id + META_ROOT_URLS_WITH_AGENCIES, + 
META_ROOT_URLS_WITH_AGENCIES.c.root_url_id == UNVALIDATED_URLS_WITH_ROOT.c.root_url_id ) .where( COUNT_AGENCY_PER_URL_CTE.c.agency_count >= 1 diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py index 8607131c..774787b7 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py @@ -6,7 +6,7 @@ COUNT_AGENCY_PER_URL_CTE: CTE = ( select( - META_ROOT_URLS_CTE.c.url_id, + META_ROOT_URLS_CTE.c.root_url_id, func.count(LinkURLAgency.agency_id).label("agency_count") ) .join( @@ -14,7 +14,7 @@ META_ROOT_URLS_CTE.c.meta_url_id == LinkURLAgency.url_id ) .group_by( - META_ROOT_URLS_CTE.c.url_id + META_ROOT_URLS_CTE.c.root_url_id ) .cte("count_agency_per_url") ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py index bd388f8f..86b14ee4 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py @@ -6,7 +6,7 @@ META_ROOT_URLS_WITH_AGENCIES: CTE = ( select( - META_ROOT_URLS_CTE.c.url_id, + META_ROOT_URLS_CTE.c.meta_url_id, META_ROOT_URLS_CTE.c.root_url_id, LinkURLAgency.agency_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py index b2c89748..edf9e601 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py @@ -4,15 +4,14 @@ CONSOLIDATED_CTE from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode -MULTI_AGENCY_CASE_CTE: CTE = ( +MULTI_AGENCY_CASE_QUERY = ( select( CONSOLIDATED_CTE.c.url_id, CONSOLIDATED_CTE.c.agency_id, - literal(100 / CONSOLIDATED_CTE.c.agency_count).label("confidence"), + (literal(100) / CONSOLIDATED_CTE.c.agency_count).label("confidence"), literal(SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY.value).label("detail_code") ) .where( CONSOLIDATED_CTE.c.agency_count > 1 ) - .cte("multi_agency_case") ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py index 05734184..5778ecb6 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py @@ -4,7 +4,7 @@ CONSOLIDATED_CTE from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode -SINGLE_AGENCY_CASE_CTE: CTE = ( +SINGLE_AGENCY_CASE_QUERY = ( select( CONSOLIDATED_CTE.c.url_id, CONSOLIDATED_CTE.c.agency_id, @@ -14,5 +14,4 @@ .where( CONSOLIDATED_CTE.c.agency_count == 1 ) - .cte("single_agency_case") ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py index bdfaa046..46702833 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py @@ -18,4 +18,5 @@ WHITELISTED_ROOT_URLS_CTE, WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id ) + .cte("unvalidated_urls_with_root") ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py index 645a5200..10619531 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py @@ -1,21 +1,35 @@ -from sqlalchemy import select +from typing import Sequence + +from sqlalchemy import Select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ - CONSOLIDATED_CTE -from src.db.queries.base.builder import QueryBuilderBase +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.multi_agency_case import \ + MULTI_AGENCY_CASE_QUERY +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.single_agency_case import \ + SINGLE_AGENCY_CASE_QUERY from src.db.helpers.session import session_helper as sh +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode +from 
src.db.queries.base.builder import QueryBuilderBase class GetHomepageMatchSubtaskURLsQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[int]: - query = ( - select( - CONSOLIDATED_CTE.c.url_id, - ).distinct() - ) + async def run(self, session: AsyncSession) -> list[GetHomepageMatchParams]: + + query: Select = SINGLE_AGENCY_CASE_QUERY.union(MULTI_AGENCY_CASE_QUERY) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - result: list[int] = await sh.scalars(session, query=query) - return result + results: list[GetHomepageMatchParams] = [] + for mapping in mappings: + response = GetHomepageMatchParams( + url_id=mapping["url_id"], + agency_id=mapping["agency_id"], + confidence=mapping["confidence"], + detail_code=SubtaskDetailCode(mapping["detail_code"]), + ) + results.append(response) + return results \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py deleted file mode 100644 index 18e95f20..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/insert.py +++ /dev/null @@ -1,9 +0,0 @@ -from src.db.queries.base.builder import QueryBuilderBase - - -class InsertHomepageMatchSubtaskEntriesQueryBuilder(QueryBuilderBase): - # TODO: Write Insert for Subtasks - - # TODO: Write insert for Subtask entries - - # TODO: Do URL link \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py index cf109207..4d75b4e0 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py @@ -1,79 +1,21 @@ -from typing import Sequence - from sqlalchemy import select, exists +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ SubtaskCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query -from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL -from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -from src.db.models.views.meta_url import MetaURL - -NOT_ROOT_URL_FLAG = ( - ~exists() - .where( - FlagRootURL.url_id == URL.id, - ) -) - -NOT_META_URL_FLAG = ( - ~exists() - .where( - MetaURL.url_id == URL.id, - ) -) - -BLACKLISTED_ROOTS: Sequence[str] = ( - 'https://www.facebook.com', - 'https://www.countyoffice.org', - '://', - 'https://www.usmarshals.gov', - 'https://www.mapquest.com', - 'https://catalog.data.gov', - 'https://www.muckrock.com' -) - -# Root URL must not be blacklisted -WHITELISTED_ROOT_URL = ( - select( - URL.id - ) - .join( - FlagRootURL, - FlagRootURL.url_id == URL.id, - ) - .where( - URL.url.notin_(BLACKLISTED_ROOTS), - ) - .cte("whitelisted_root_url") -) - -ROOT_URLS_WITH_META_URLS = ( - select( - WHITELISTED_ROOT_URL.c.id - ) - .where( - exists() - .where( - LinkURLRootURL.root_url_id == WHITELISTED_ROOT_URL.c.id, - LinkURLRootURL.url_id == MetaURL.url_id, - ) - ) - .cte("root_urls_with_meta_urls") -) -HAS_ROOT_URL_WITH_META_URLS = ( +VALID_URL_FLAG = ( exists() .where( - LinkURLRootURL.root_url_id == ROOT_URLS_WITH_META_URLS.c.id, - 
LinkURLRootURL.url_id == URL.id, + URL.id == CONSOLIDATED_CTE.c.url_id, ) ) - cte = ( select( URL.id, @@ -81,15 +23,8 @@ AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, ) ) - .join( - LinkURLRootURL, - LinkURLRootURL.url_id == URL.id, - ) .where( - NOT_META_URL_FLAG, - NOT_ROOT_URL_FLAG, - HAS_ROOT_URL_WITH_META_URLS, - + VALID_URL_FLAG, ) .cte("homepage_eligible") ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py new file mode 100644 index 00000000..05a9e2bb --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py @@ -0,0 +1,51 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_blacklist( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """Test Survey does not pick up for Homepage Match + URLs with root URLs that have more than two agencies + whose meta_urls have it as a root""" + # Create Root URL + root_url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Flag as Root + await db_data_creator.flag_as_root([root_url_id]) + + # Create ineligible URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Link Root URL 
to ineligible URL + await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) + + # Create Meta URLs + meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + count=3, + validation_type=URLValidatedType.META_URL + ) + + # Create 3 agencies + agency_ids: list[int] = await db_data_creator.create_agencies(count=3) + + # Link Meta URLs to Agencies + await db_data_creator.link_urls_to_agencies( + url_ids=[url.url_id for url in meta_urls], + agency_ids=agency_ids + ) + + # Link Meta URLs to Root URL + await db_data_creator.link_urls_to_root( + url_ids=[url.url_id for url in meta_urls], + root_url_id=root_url_id + ) + + # Run survey and confirm prerequisites not met + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py new file mode 100644 index 00000000..a9576768 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py @@ -0,0 +1,29 @@ + +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_no_validated_meta_urls( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """Test survey does not pick up for Homepage Match + URLs whose Root URLs do not have validated meta URLs.""" + + # Create Root URL + root_url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Flag as Root + await db_data_creator.flag_as_root([root_url_id]) + + # Create ineligible URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Link Root URL 
to ineligible URL + await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) + + # Run survey and confirm prerequisites not met + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py new file mode 100644 index 00000000..627dd05a --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py @@ -0,0 +1,22 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from tests.conftest import db_data_creator +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_root_urls( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """Test survey does not pick up root URLs for Homepage Match.""" + + # Create URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Flag as Root + await db_data_creator.flag_as_root([url_id]) + + # Run survey and confirm prerequisites not met + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py deleted file mode 100644 index a128bde1..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_core.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_homepage_match(): - raise NotImplementedError \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py new file mode 100644 index 00000000..43a1677c --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py @@ -0,0 +1,159 @@ +from collections import defaultdict + +import pytest + +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_homepage_match( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """ + Test the following cases: + Single Agency: A URL whose Root URL has one meta URL is properly linked + Multi Agency: A URL whose Root URL has multiple meta URLs is properly linked + """ + + # Create 2 root URLs + root_url_mappings: list[URLMapping] = ( + await db_data_creator.create_urls(count=2) + ) + root_url_ids: list[int] = [url_mapping.url_id for url_mapping in root_url_mappings] + + # Flag as Root + await db_data_creator.flag_as_root(root_url_ids) + + # Separate Root URLs + single_agency_root_url_id: int = root_url_ids[0] + multi_agency_root_url_id: int = 
root_url_ids[1] + + # Create 3 agencies + agency_ids: list[int] = await db_data_creator.create_agencies(count=3) + single_agency_id: int = agency_ids[0] + multi_agency_ids: list[int] = agency_ids[1:] + + # Create 1 Meta URL for single agency case + single_meta_url_id: int = (await db_data_creator.create_validated_urls( + count=1, + validation_type=URLValidatedType.META_URL + ))[0].url_id + # Link single meta URL to single agency + await db_data_creator.create_url_agency_links( + url_ids=[single_meta_url_id], + agency_ids=[single_agency_id]) + # Link single meta URL to root + await db_data_creator.link_urls_to_root( + url_ids=[single_meta_url_id], + root_url_id=single_agency_root_url_id + ) + + + # Create 2 Meta URLs and agencies for multi agency case + multi_meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + count=2, + validation_type=URLValidatedType.META_URL + ) + multi_meta_url_ids: list[int] = [url_mapping.url_id for url_mapping in multi_meta_urls] + # Link multi meta URLs to agencies + await db_data_creator.create_url_agency_links( + url_ids=[multi_meta_url_ids[0]], + agency_ids=[multi_agency_ids[0]] + ) + await db_data_creator.create_url_agency_links( + url_ids=[multi_meta_url_ids[1]], + agency_ids=[multi_agency_ids[1]] + ) + # Link multi meta URLs to root + await db_data_creator.link_urls_to_root( + url_ids=multi_meta_url_ids, + root_url_id=multi_agency_root_url_id + ) + + # Check operator does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Set up eligible URLs + eligible_urls: list[URLMapping] = await db_data_creator.create_urls( + count=2, + ) + single_url_id: int = eligible_urls[0].url_id + multi_url_id: int = eligible_urls[1].url_id + + # Link eligible URLs to each root + await db_data_creator.link_urls_to_root( + url_ids=[single_url_id], + root_url_id=single_agency_root_url_id + ) + await db_data_creator.link_urls_to_root( + url_ids=[multi_url_id], + root_url_id=multi_agency_root_url_id + ) 
+ + # Check operator now meets prerequisites + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.HOMEPAGE_MATCH + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + + # Confirm presence of subtasks + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 2 + + # Confirm both listed as agencies found + assert all(subtask.agencies_found for subtask in subtasks) + + url_id_to_subtask: dict[int, URLAutoAgencyIDSubtask] = { + subtask.url_id: subtask for subtask in subtasks + } + single_subtask: URLAutoAgencyIDSubtask = url_id_to_subtask[single_url_id] + multi_subtask: URLAutoAgencyIDSubtask = url_id_to_subtask[multi_url_id] + + # Check subtasks have expected detail codes + assert single_subtask.detail == SubtaskDetailCode.HOMEPAGE_SINGLE_AGENCY + assert multi_subtask.detail == SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY + + + # Get suggestions + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) + assert len(suggestions) == 3 + + # Confirm each suggestion properly linked to expected subtask + subtask_id_to_suggestions: dict[int, list[AgencyIDSubtaskSuggestion]] = defaultdict(list) + for suggestion in suggestions: + subtask_id_to_suggestions[suggestion.subtask_id].append(suggestion) + + # Check Single Agency Case Suggestion + single_suggestion: AgencyIDSubtaskSuggestion = \ + subtask_id_to_suggestions[single_subtask.id][0] + # Check Single Agency Case Suggestion has expected agency + assert single_suggestion.agency_id == single_agency_id + # Confirm confidence is 95 + assert single_suggestion.confidence == 95 + + # Check Multi Agency Case Suggestion + multi_suggestions: list[AgencyIDSubtaskSuggestion] = 
subtask_id_to_suggestions[multi_subtask.id] + # Check Multi Agency Case Suggestion has expected agencies + assert {suggestion.agency_id for suggestion in multi_suggestions} \ + == set(multi_agency_ids) + # Confirm confidence for each is 50 + assert all(suggestion.confidence == 50 for suggestion in multi_suggestions) + + # Test operator no longer meets prerequisites + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 6c597f3f..57ee3576 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -9,8 +9,10 @@ from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.client.sync import DatabaseClient @@ -21,6 +23,7 @@ from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.counter import next_int from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand from tests.helpers.data_creator.commands.impl.batch import DBDataCreatorBatchCommand @@ -503,3 +506,42 @@ async def create_agency(self, agency_id: int = 1) -> None: ) await 
self.adb_client.add_all([agency]) + async def create_agencies(self, count: int = 3) -> list[int]: + agencies: list[Agency] = [] + agency_ids: list[int] = [] + for _ in range(count): + agency_id = next_int() + agency = Agency( + agency_id=agency_id, + name=generate_test_name(agency_id), + state=None, + county=None, + locality=None + ) + agencies.append(agency) + agency_ids.append(agency_id) + await self.adb_client.add_all(agencies) + return agency_ids + + async def flag_as_root(self, url_ids: list[int]) -> None: + flag_root_urls: list[FlagRootURL] = [ + FlagRootURL(url_id=url_id) for url_id in url_ids + ] + await self.adb_client.add_all(flag_root_urls) + + async def link_urls_to_root(self, url_ids: list[int], root_url_id: int) -> None: + links: list[LinkURLRootURL] = [ + LinkURLRootURL(url_id=url_id, root_url_id=root_url_id) for url_id in url_ids + ] + await self.adb_client.add_all(links) + + async def link_urls_to_agencies(self, url_ids: list[int], agency_ids: list[int]) -> None: + assert len(url_ids) == len(agency_ids) + links: list[LinkURLAgency] = [] + for url_id, agency_id in zip(url_ids, agency_ids): + link = LinkURLAgency( + url_id=url_id, + agency_id=agency_id + ) + links.append(link) + await self.adb_client.add_all(links) \ No newline at end of file From ef12a5c5e17d414af69b7ccd0c6644ea4be7c599 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 8 Sep 2025 07:59:48 -0400 Subject: [PATCH 26/33] Add framework of test for nlp --- .../subtasks/nlp_location_match/test_core.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py index 19f5eb5b..75eacd59 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py +++ 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py @@ -1,6 +1,24 @@ import pytest +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from tests.helpers.data_creator.core import DBDataCreator + @pytest.mark.asyncio -async def test_nlp_location_match(): - raise NotImplementedError \ No newline at end of file +async def test_nlp_location_match( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator +): + + # Create 2 URLs with compressed HTML + url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_ids: list[int] = [url.url_id for url in url_mappings] + await db_data_creator.html_data(url_ids=url_ids) + + # Confirm operator meets prerequisites + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + # raise NotImplementedError \ No newline at end of file From 0471f15f2f0a2ce82ca6e5380f8b72d157829e27 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 8 Sep 2025 11:54:03 -0400 Subject: [PATCH 27/33] Continue Draft --- src/external/pdap/client.py | 85 ++++++++++++------- .../dtos/search_agency_by_location/params.py | 3 +- .../search_agency_by_location/response.py | 8 +- .../pdap/test_sc_agency_search_location.py | 34 ++++++++ 4 files changed, 92 insertions(+), 38 deletions(-) create mode 100644 tests/manual/external/pdap/test_sc_agency_search_location.py diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index a6abb785..19606b84 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,6 +1,7 @@ +from datetime import date from typing import Optional, Any -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType +from 
pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters @@ -29,6 +30,28 @@ async def search_agency_by_location( self, params: list[SearchAgencyByLocationParams] ) -> list[SearchAgencyByLocationResponse]: + request_url: str = self.access_manager.build_url( + namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, + subdomains=["agencies", "search", "location"] + ) + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + + json_params: list[dict[str, Any]] = [ + param.model_dump(mode='json') + for param in params + ] + + request_info = RequestInfo( + type_=RequestType.POST, + url=request_url, + headers=headers, + json_={ + "requests": json_params + } + ) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + raise NotImplementedError async def match_agency( @@ -41,13 +64,13 @@ async def match_agency( """ Returns agencies, if any, that match or partially match the search criteria """ - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.MATCH, subdomains=["agency"] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" request_info = RequestInfo( type_=RequestType.POST, url=url, @@ -59,15 +82,15 @@ async def match_agency( "locality": locality } ) - response_info = await self.access_manager.make_request(request_info) - matches = [] + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + matches: list[MatchAgencyInfo] = [] for agency in response_info.data["agencies"]: mai = MatchAgencyInfo( 
id=agency['id'], submitted_name=agency['name'] ) if len(agency['locations']) > 0: - first_location = agency['locations'][0] + first_location: dict[str, Any] = agency['locations'][0] mai.state = first_location['state'] mai.county = first_location['county'] mai.locality = first_location['locality'] @@ -85,7 +108,7 @@ async def is_url_duplicate( """ Check if a URL is unique. Returns duplicate info otherwise """ - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.CHECK, subdomains=["unique-url"] ) @@ -96,9 +119,11 @@ async def is_url_duplicate( "url": url_to_check } ) - response_info = await self.access_manager.make_request(request_info) - duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] - is_duplicate = (len(duplicates) != 0) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + duplicates: list[UniqueURLDuplicateInfo] = [ + UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"] + ] + is_duplicate: bool = (len(duplicates) != 0) return is_duplicate async def submit_urls( @@ -115,11 +140,11 @@ async def submit_urls( ) # Build url-id dictionary - url_id_dict = {} + url_id_dict: dict[str, int] = {} for tdo in tdos: url_id_dict[tdo.url] = tdo.url_id - data_sources_json = [] + data_sources_json: list[dict[str, Any]] = [] for tdo in tdos: data_sources_json.append( { @@ -135,7 +160,7 @@ async def submit_urls( } ) - headers = await self.access_manager.jwt_header() + headers: dict[str, str] = await self.access_manager.jwt_header() request_info = RequestInfo( type_=RequestType.POST, url=request_url, @@ -144,12 +169,12 @@ async def submit_urls( "data_sources": data_sources_json } ) - response_info = await self.access_manager.make_request(request_info) - data_sources_response_json = response_info.data["data_sources"] + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + 
data_sources_response_json: list[dict[str, Any]] = response_info.data["data_sources"] - results = [] + results: list[SubmittedURLInfo] = [] for data_source in data_sources_response_json: - url = data_source["url"] + url: str = data_source["url"] response_object = SubmittedURLInfo( url_id=url_id_dict[url], data_source_id=data_source["data_source_id"], @@ -163,20 +188,20 @@ async def sync_agencies( self, params: AgencySyncParameters ) -> AgenciesSyncResponseInfo: - url =self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=[ "agencies", "sync" ] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" request_params: dict[str, Any] = { "page": params.page } if params.cutoff_date is not None: - params["updated_at"] = params.cutoff_date + params["updated_at"]: date = params.cutoff_date request_info = RequestInfo( type_=RequestType.GET, @@ -184,7 +209,7 @@ async def sync_agencies( headers=headers, params=request_params ) - response_info = await self.access_manager.make_request(request_info) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) return AgenciesSyncResponseInfo( agencies=[ AgenciesSyncResponseInnerInfo(**entry) @@ -196,18 +221,18 @@ async def sync_data_sources( self, params: DataSourcesSyncParameters ) -> DataSourcesSyncResponseInfo: - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=[ "data-sources", "sync" ] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" - params_dict = {"page": params.page} + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + params_dict: dict[str, Any] = 
{"page": params.page} if params.cutoff_date is not None: - params_dict["updated_at"] = params.cutoff_date + params_dict["updated_at"]: date = params.cutoff_date request_info = RequestInfo( type_=RequestType.GET, @@ -215,10 +240,10 @@ async def sync_data_sources( headers=headers, params=params_dict ) - response_info = await self.access_manager.make_request(request_info) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) return DataSourcesSyncResponseInfo( data_sources=[ DataSourcesSyncResponseInnerInfo(**entry) for entry in response_info.data["data_sources"] ] - ) \ No newline at end of file + ) diff --git a/src/external/pdap/dtos/search_agency_by_location/params.py b/src/external/pdap/dtos/search_agency_by_location/params.py index 855c9a76..800fa881 100644 --- a/src/external/pdap/dtos/search_agency_by_location/params.py +++ b/src/external/pdap/dtos/search_agency_by_location/params.py @@ -3,5 +3,4 @@ class SearchAgencyByLocationParams(BaseModel): request_id: int - state_iso: str | None - locations: list[str] \ No newline at end of file + query: str \ No newline at end of file diff --git a/src/external/pdap/dtos/search_agency_by_location/response.py b/src/external/pdap/dtos/search_agency_by_location/response.py index 7f786c89..d894b2d8 100644 --- a/src/external/pdap/dtos/search_agency_by_location/response.py +++ b/src/external/pdap/dtos/search_agency_by_location/response.py @@ -1,10 +1,6 @@ from pydantic import BaseModel, Field - -class SearchAgencyByLocationResult(BaseModel): - agency_id: int - similarity: float = Field(ge=0, le=1) - class SearchAgencyByLocationResponse(BaseModel): request_id: int - results: list[SearchAgencyByLocationResult] \ No newline at end of file + agency_id: int + similarity: float = Field(ge=0, le=1) \ No newline at end of file diff --git a/tests/manual/external/pdap/test_sc_agency_search_location.py b/tests/manual/external/pdap/test_sc_agency_search_location.py new file mode 100644 index 00000000..9b0aac28 
--- /dev/null +++ b/tests/manual/external/pdap/test_sc_agency_search_location.py @@ -0,0 +1,34 @@ +""" + +Location ID, Agency ID +10464,9873, "Boonsboro, Washington, Maryland" +15648,9878, "Smithsburg, Washington, Maryland" +15656,9879, "Williamsport, Washington, Maryland" + +""" +import pytest + +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +@pytest.mark.asyncio +async def test_sc_agency_search_location(pdap_client_dev: PDAPClient): + params: list[SearchAgencyByLocationParams] = [ + SearchAgencyByLocationParams( + request_id=1, + query="Boonsboro, Washington, Maryland" + ), + SearchAgencyByLocationParams( + request_id=0, + query="Smithsburg, Washington, Maryland" + ), + SearchAgencyByLocationParams( + request_id=-99, + query="Williamsport, Washington, Maryland" + ) + ] + response: list[SearchAgencyByLocationResponse] = await pdap_client_dev.search_agency_by_location(params) + print(response) + From 0346817cd36b8155816b03672026ac0b68ed3c29 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 8 Sep 2025 22:18:22 -0400 Subject: [PATCH 28/33] Continue draft --- Dockerfile | 2 + ENV.md | 5 +- pyproject.toml | 1 + src/api/main.py | 7 ++ src/core/tasks/url/loader.py | 5 +- .../operators/agency_identification/core.py | 8 +- .../processor_ => flags}/__init__.py | 0 .../subtasks/flags/core.py | 26 ++++++ .../subtasks/flags/mappings.py | 8 ++ .../subtasks/impl/ckan_/query.py | 19 ++-- .../subtasks/impl/muckrock_/query.py | 22 ++--- .../subtasks/impl/nlp_location_match_/core.py | 59 ++++-------- .../models => processor}/__init__.py | 0 .../{ => processor}/convert.py | 35 ++++++-- .../nlp_location_match_/processor/core.py | 89 +++++++++++++++++++ .../nlp_location_match_/processor/counter.py | 11 +++ .../nlp_location_match_/processor/mapper.py | 10 +++ 
.../processor/nlp/__init__.py | 0 .../{processor_ => processor/nlp}/check.py | 2 +- .../processor/nlp/constants.py | 3 + .../{processor_ => processor/nlp}/convert.py | 4 +- .../{processor_ => processor/nlp}/core.py | 39 +++++--- .../processor/nlp/enums.py | 8 ++ .../processor/nlp/extract.py | 25 ++++++ .../{processor_ => processor/nlp}/mappings.py | 0 .../processor/nlp/models/__init__.py | 0 .../nlp}/models/params.py | 0 .../processor/nlp/models/response.py | 17 ++++ .../nlp}/models/us_state.py | 0 .../processor_/models/response.py | 9 -- .../impl/nlp_location_match_/query.py | 35 +++++--- .../agency_identification/subtasks/loader.py | 12 ++- .../subtasks/queries/survey/core.py | 22 ----- .../subtasks/queries/survey/queries/core.py | 24 ++++- .../queries/survey/queries/ctes/eligible.py | 57 ++++++++---- .../queries/survey/queries/eligible_counts.py | 13 +-- src/external/pdap/client.py | 13 +-- .../search_agency_by_location/response.py | 10 ++- .../impl/agency_identification/conftest.py | 7 +- .../nlp_location_match/end_to_end/__init__.py | 0 .../nlp_location_match/end_to_end/conftest.py | 15 ++++ .../end_to_end/test_core.py | 29 ++++++ .../end_to_end/test_no_results.py | 0 .../end_to_end/test_results.py | 0 .../internal_processor/__init__.py | 0 .../subtasks/nlp_location_match/test_core.py | 24 ----- .../agency_identification/survey/__init__.py | 0 .../survey/test_survey_flag.py | 49 ++++++++++ .../integration/tasks/url/loader/conftest.py | 4 +- .../agency_identifier/test_nlp_processor.py | 22 +++++ uv.lock | 11 +++ 51 files changed, 561 insertions(+), 200 deletions(-) rename src/core/tasks/url/operators/agency_identification/subtasks/{impl/nlp_location_match_/processor_ => flags}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py rename 
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_/models => processor}/__init__.py (100%) rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{ => processor}/convert.py (68%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_ => processor/nlp}/check.py (82%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_ => processor/nlp}/convert.py (84%) rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_ => processor/nlp}/core.py (54%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_ => processor/nlp}/mappings.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_ => processor/nlp}/models/params.py (100%) create mode 
100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor_ => processor/nlp}/models/us_state.py (100%) delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_no_results.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_results.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/survey/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py create mode 100644 tests/manual/agency_identifier/test_nlp_processor.py diff --git a/Dockerfile b/Dockerfile index 85931528..e96272b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ RUN uv sync --locked --no-dev # Must call from the root directory because uv does not add playwright to path RUN playwright 
install-deps chromium RUN playwright install chromium +# Download Spacy Model +RUN python -m spacy download en_core_web_sm # Copy project files COPY src ./src diff --git a/ENV.md b/ENV.md index 4085fcd6..c0df0c2d 100644 --- a/ENV.md +++ b/ENV.md @@ -53,7 +53,10 @@ The following flags are available: | `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | | `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | | `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | - +| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. | +| `AGENCY_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for agency identification. | +| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. | +| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. | ## Foreign Data Wrapper (FDW) diff --git a/pyproject.toml b/pyproject.toml index 9da9a0f5..afe4a89a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "marshmallow~=3.23.2", "openai~=1.60.1", "pdap-access-manager==0.3.6", + "pip>=25.2", "playwright~=1.49.1", "psycopg2-binary~=2.9.6", "psycopg[binary]~=3.1.20", diff --git a/src/api/main.py b/src/api/main.py index b6679827..f17c147f 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -27,6 +27,10 @@ from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ + SpacyModelType from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient @@ 
-83,6 +87,9 @@ async def lifespan(app: FastAPI): session=session, token=env_var_manager.hf_inference_api_key ), + nlp_processor=NLPProcessor( + model_type=SpacyModelType.EN_CORE_WEB_SM + ) ), ) async_collector_manager = AsyncCollectorManager( diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 8b5a18c1..91b52f50 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,6 +7,8 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator @@ -20,7 +22,6 @@ from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient -from src.external.internet_archives.client import InternetArchivesClient from src.external.pdap.client import PDAPClient from src.external.url_request.core import URLRequestInterface @@ -35,11 +36,13 @@ def __init__( pdap_client: PDAPClient, muckrock_api_interface: MuckrockAPIInterface, hf_inference_client: HuggingFaceInferenceClient, + nlp_processor: NLPProcessor ): # Dependencies self.adb_client = adb_client self.url_request_interface = url_request_interface self.html_parser = html_parser + self.nlp_processor = nlp_processor self.env = Env() # External clients and interfaces diff --git a/src/core/tasks/url/operators/agency_identification/core.py 
b/src/core/tasks/url/operators/agency_identification/core.py index f5a84061..d4f5f87c 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -1,5 +1,6 @@ from src.core.tasks.mixins.link_urls import LinkURLsMixin from src.core.tasks.url.operators.agency_identification.exceptions import SubtaskError +from src.core.tasks.url.operators.agency_identification.subtasks.flags.core import SubtaskFlagger from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.core import \ @@ -34,9 +35,14 @@ async def meets_task_prerequisites(self) -> bool: Modifies: - self._subtask """ + flagger = SubtaskFlagger() + allowed_subtasks: list[AutoAgencyIDSubtaskType] = flagger.get_allowed_subtasks() + next_subtask: AutoAgencyIDSubtaskType | None = \ await self.adb_client.run_query_builder( - AgencyIDSubtaskSurveyQueryBuilder() + AgencyIDSubtaskSurveyQueryBuilder( + allowed_subtasks=allowed_subtasks + ) ) self._subtask = next_subtask if next_subtask is None: diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/flags/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py new file mode 100644 index 00000000..41997322 --- /dev/null +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py @@ -0,0 +1,26 @@ + +from environs import Env + +from src.core.tasks.url.operators.agency_identification.subtasks.flags.mappings import SUBTASK_TO_ENV_FLAG +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + + +class SubtaskFlagger: + """ + Manages flags allowing and disallowing subtasks + """ + def __init__(self): + self.env = Env() + + def _get_subtask_flag(self, subtask_type: AutoAgencyIDSubtaskType) -> bool: + return self.env.bool( + SUBTASK_TO_ENV_FLAG[subtask_type], + default=True + ) + + def get_allowed_subtasks(self) -> list[AutoAgencyIDSubtaskType]: + return [ + subtask_type + for subtask_type, flag in SUBTASK_TO_ENV_FLAG.items() + if self._get_subtask_flag(subtask_type) + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py new file mode 100644 index 00000000..d6997423 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py @@ -0,0 +1,8 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +SUBTASK_TO_ENV_FLAG: dict[AutoAgencyIDSubtaskType, str] = { + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: "AGENCY_ID_HOMEPAGE_MATCH_FLAG", + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: "AGENCY_ID_NLP_LOCATION_MATCH_FLAG", + AutoAgencyIDSubtaskType.CKAN: "AGENCY_ID_CKAN_FLAG", + AutoAgencyIDSubtaskType.MUCKROCK: "AGENCY_ID_MUCKROCK_FLAG" +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py index 86160a10..90e965e7 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py @@ -5,6 +5,8 @@ from src.collectors.enums import CollectorType from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -18,25 +20,18 @@ async def run( self, session: AsyncSession ) -> list[CKANAgencyIDSubtaskParams]: + container = EligibleContainer() query = ( select( - URL.id, + container.url_id, URL.collector_metadata ) .join( - LinkBatchURL, - LinkBatchURL.url_id == URL.id, - ) - .join( - Batch, - Batch.id == LinkBatchURL.batch_id, + URL, + URL.id == container.url_id, ) .where( - Batch.strategy.in_( - ( - CollectorType.CKAN.value, - ) - ), + container.ckan, ) .limit(500) ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py index 5c292f37..6f575b4f 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py @@ -6,6 +6,8 @@ from src.collectors.enums import CollectorType from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -19,27 +21,19 @@ async def run( self, session: AsyncSession 
) -> list[MuckrockAgencyIDSubtaskParams]: + container = EligibleContainer() + query = ( select( - URL.id, + container.url_id, URL.collector_metadata ) .join( - LinkBatchURL, - LinkBatchURL.url_id == URL.id, - ) - .join( - Batch, - Batch.id == LinkBatchURL.batch_id, + URL, + URL.id == container.url_id, ) .where( - Batch.strategy.in_( - ( - CollectorType.MUCKROCK_ALL_SEARCH.value, - CollectorType.MUCKROCK_COUNTY_SEARCH.value, - CollectorType.MUCKROCK_SIMPLE_SEARCH.value, - ) - ), + container.muckrock, ) .limit(500) ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py index 3999cc42..6aeec35e 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -1,20 +1,17 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ ITERATIONS_PER_SUBTASK -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.convert import \ - convert_nlp_response_to_search_agency_by_location_params, convert_search_agency_responses_to_subtask_data_list from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ NLPLocationMatchSubtaskInput -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.core import NLPProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ - NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query import \ GetNLPLocationMatchSubtaskInputQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse class NLPLocationMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): @@ -26,9 +23,12 @@ def __init__( pdap_client: PDAPClient, processor: NLPProcessor ) -> None: - super().__init__(adb_client, task_id) - self.processor = processor - self.pdap_client = pdap_client + super().__init__(adb_client, task_id=task_id) + self.processor = AgencyIDSubtaskInternalProcessor( + nlp_processor=processor, + pdap_client=pdap_client, + task_id=task_id, + ) async def inner_logic(self) -> None: for iteration in range(ITERATIONS_PER_SUBTASK): @@ -38,40 +38,19 @@ async def inner_logic(self) -> None: await self.run_subtask_iteration(inputs) async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: - search_params: list[SearchAgencyByLocationParams] = [] - for input_ in inputs: - nlp_response: NLPLocationMatchResponse = await self._get_location_match(input_.html) - search_param: SearchAgencyByLocationParams = \ - convert_nlp_response_to_search_agency_by_location_params( - url_id=input_.url_id, - nlp_response=nlp_response, - ) - search_params.append(search_param) - - search_responses: list[SearchAgencyByLocationResponse] = \ - await self._get_pdap_info(search_params) 
- - subtask_data_list: list[AutoAgencyIDSubtaskData] = \ - convert_search_agency_responses_to_subtask_data_list( - responses=search_responses, - task_id=self.task_id, - ) + subtask_data_list: list[AutoAgencyIDSubtaskData] = await self._process_inputs(inputs) await self._upload_subtask_data(subtask_data_list) + async def _process_inputs( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[AutoAgencyIDSubtaskData]: + return await self.processor.process( + inputs=inputs, + ) + async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: return await self.adb_client.run_query_builder( query_builder=GetNLPLocationMatchSubtaskInputQueryBuilder(), ) - - async def _get_pdap_info( - self, - params: list[SearchAgencyByLocationParams] - ) -> list[SearchAgencyByLocationResponse]: - return await self.pdap_client.search_agency_by_location(params) - - async def _get_location_match( - self, - html: str - ) -> NLPLocationMatchResponse: - return self.processor.parse_for_locations(html) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py similarity index 68% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py index 64f299fe..3e0924ba 100644 
--- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -1,6 +1,9 @@ from math import ceil -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ + URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ NLPLocationMatchResponse from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion @@ -11,35 +14,49 @@ def convert_nlp_response_to_search_agency_by_location_params( - url_id: int, nlp_response: NLPLocationMatchResponse, -) -> SearchAgencyByLocationParams: - return SearchAgencyByLocationParams( - request_id=url_id, - locations=nlp_response.locations, - state_iso=nlp_response.us_state.iso, - ) + counter: RequestCounter +) -> list[SearchAgencyByLocationParams]: + params: list[SearchAgencyByLocationParams] = [] + for location in nlp_response.locations: + if nlp_response.us_state is not None: + query: str = f"{location}, {nlp_response.us_state.name}" + else: + query: str = location + request_id: int = counter.next() + param = SearchAgencyByLocationParams( + request_id=request_id, + query=query + ) + params.append(param) + + return params + + def convert_search_agency_responses_to_subtask_data_list( + mapper: URLRequestIDMapper, responses: list[SearchAgencyByLocationResponse], task_id: int ) -> list[AutoAgencyIDSubtaskData]: subtask_data_list: 
list[AutoAgencyIDSubtaskData] = [] for response in responses: + url_id: int = mapper.get_url_id_by_request_id(response.request_id) subtask_data: AutoAgencyIDSubtaskData = \ convert_search_agency_response_to_subtask_data( response=response, task_id=task_id, + url_id=url_id, ) subtask_data_list.append(subtask_data) return subtask_data_list def convert_search_agency_response_to_subtask_data( + url_id: int, response: SearchAgencyByLocationResponse, task_id: int ) -> AutoAgencyIDSubtaskData: suggestions: list[AgencySuggestion] = [] - url_id: int = response.request_id for result in response.results: agency_id: int = result.agency_id similarity: float = result.similarity diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py new file mode 100644 index 00000000..f283ca7b --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py @@ -0,0 +1,89 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ + convert_nlp_response_to_search_agency_by_location_params, convert_search_agency_responses_to_subtask_data_list +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ + URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + 
NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class AgencyIDSubtaskInternalProcessor: + +    def __init__( +        self, +        nlp_processor: NLPProcessor, +        pdap_client: PDAPClient, +        task_id: int, +    ): +        self._nlp_processor = nlp_processor +        self._pdap_client = pdap_client +        self._counter = RequestCounter() +        self._mapper = URLRequestIDMapper() +        self._task_id = task_id + +    async def process( +        self, +        inputs: list[NLPLocationMatchSubtaskInput] +    ) -> list[AutoAgencyIDSubtaskData]: + +        search_params: list[SearchAgencyByLocationParams] = self._extract_search_params( +            inputs=inputs +        ) + +        search_responses: list[SearchAgencyByLocationResponse] = \ +            await self._get_pdap_info(search_params) + +        subtask_data_list: list[AutoAgencyIDSubtaskData] = \ +            convert_search_agency_responses_to_subtask_data_list( +                responses=search_responses, +                task_id=self._task_id, +                mapper=self._mapper, +            ) + +        return subtask_data_list + +    def _extract_search_params( +        self, +        inputs: list[NLPLocationMatchSubtaskInput] +    ) -> list[SearchAgencyByLocationParams]: +        """ +        Modifies: +        - self._mapper +        - self._counter +        """ +        all_search_params: list[SearchAgencyByLocationParams] = [] +        for input_ in inputs: +            nlp_response: NLPLocationMatchResponse = self._get_location_match(input_.html) +            search_params: list[ +                SearchAgencyByLocationParams] = convert_nlp_response_to_search_agency_by_location_params( +                counter=self._counter, +                nlp_response=nlp_response, +            ) +            for search_param in search_params: +                self._mapper.add_mapping( +                    request_id=search_param.request_id, +                    url_id=input_.url_id, +                ) +                # NOTE(review): removed `search_params.append(search_param)` here — +            all_search_params.extend(search_params) + 
return all_search_params + + def _get_location_match( + self, + html: str + ) -> NLPLocationMatchResponse: + return self._nlp_processor.parse_for_locations(html) + + async def _get_pdap_info( + self, + params: list[SearchAgencyByLocationParams] + ) -> list[SearchAgencyByLocationResponse]: + return await self._pdap_client.search_agency_by_location(params) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py new file mode 100644 index 00000000..12e9e048 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py @@ -0,0 +1,11 @@ + + + +class RequestCounter: + + def __init__(self): + self._counter: int = 0 + + def next(self) -> int: + self._counter += 1 + return self._counter \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py new file mode 100644 index 00000000..8192dbb6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py @@ -0,0 +1,10 @@ +class URLRequestIDMapper: + + def __init__(self): + self._request_id_to_url_id_mapper: dict[int, int] = {} + + def add_mapping(self, request_id: int, url_id: int) -> None: + self._request_id_to_url_id_mapper[request_id] = url_id + + def get_url_id_by_request_id(self, request_id: int) -> int: + return self._request_id_to_url_id_mapper[request_id] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py new file mode 100644 index 00000000..e69de29b diff 
--git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py similarity index 82% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py index 2019cbcf..ef60e038 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/check.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.mappings import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \ US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py new file mode 100644 index 00000000..267f728b --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py @@ -0,0 +1,3 @@ + + +TOP_N_LOCATIONS_COUNT: int = 5 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py similarity index 84% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py rename to 
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py index f29bb11b..040bc466 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py @@ -1,6 +1,6 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.mappings import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \ US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.us_state import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ USState diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py similarity index 54% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py index 45b8d235..442585f2 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py @@ -1,16 +1,20 @@ from collections import Counter -from typing import Mapping +import spacy from spacy import Language from spacy.tokens import Doc -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.check import \ +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.check import \ is_name_us_state, is_iso_us_state -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.convert import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.convert import \ convert_us_state_name_to_us_state, convert_us_state_iso_to_us_state -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ + SpacyModelType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.extract import \ + extract_most_common_us_state, extract_top_n_locations +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ NLPLocationMatchResponse -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.us_state import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ USState @@ -18,12 +22,20 @@ class NLPProcessor: def __init__( self, - model: Language + model_type: SpacyModelType = SpacyModelType.EN_CORE_WEB_SM ): - self._model: Language = model + self._model_type: SpacyModelType = model_type + self._model: Language | None = None + + def lazy_load_model(self) -> Language: + if self._model is None: + self._model = spacy.load(self._model_type.value, disable=['parser']) + return self._model + def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: - doc: Doc = self._model(html) + model: Language = self.lazy_load_model() + doc: Doc = model(html) us_state_counter: Counter[USState] = Counter() location_counter: Counter[str] = 
Counter() @@ -43,15 +55,14 @@ def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: continue location_counter[text] += 1 - most_common_us_state: USState | None = us_state_counter.most_common(1)[0][0] - top_5_locations_raw: list[tuple[str, int]] = location_counter.most_common(5) - top_5_locations: list[str] = [] - for location, _ in top_5_locations_raw: - top_5_locations.append(location) + # Get most common US State if exists + most_common_us_state: USState | None = extract_most_common_us_state(us_state_counter) + + top_n_locations: list[str] = extract_top_n_locations(location_counter) return NLPLocationMatchResponse( us_state=most_common_us_state, - locations=top_5_locations + locations=top_n_locations ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py new file mode 100644 index 00000000..9d1b987b --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class SpacyModelType(Enum): + EN_CORE_WEB_SM = "en_core_web_sm" + EN_CORE_WEB_LG = "en_core_web_lg" + EN_CORE_WEB_MD = "en_core_web_md" + EN_CORE_WEB_TRF = "en_core_web_trf" \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py new file mode 100644 index 00000000..ea732ef0 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py @@ -0,0 +1,25 @@ +from collections import Counter + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \ + TOP_N_LOCATIONS_COUNT +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + + +def extract_most_common_us_state( + us_state_counter: Counter[USState] +) -> USState | None: + try: + return us_state_counter.most_common(1)[0][0] + except IndexError: + return None + +def extract_top_n_locations( + location_counter: Counter[str] +) -> list[str]: + top_n_locations_raw: list[tuple[str, int]] = \ + location_counter.most_common(TOP_N_LOCATIONS_COUNT) + top_n_locations: list[str] = [] + for location, _ in top_n_locations_raw: + top_n_locations.append(location) + return top_n_locations \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/mappings.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/mappings.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/params.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py new file mode 100644 index 00000000..23904bdf --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + + +class NLPLocationMatchResponse(BaseModel): + locations: list[str] + us_state: USState | None + + @property + def empty(self) -> bool: + if self.us_state is not None: + return False + if len(self.locations) > 0: + return False + return True diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/us_state.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/us_state.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py deleted file mode 100644 index bd536dd5..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor_/models/response.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.us_state import \ - 
USState - - -class NLPLocationMatchResponse(BaseModel): - locations: list[str] - us_state: USState | None \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py index 7544ebaa..db82b22d 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py @@ -1,13 +1,18 @@ -from typing import Any +from typing import Sequence -from sqlalchemy import select +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ + NUMBER_OF_ENTRIES_PER_ITERATION from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ NLPLocationMatchSubtaskInput -from src.db.models.impl.url.core.sqlalchemy import URL +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.queries.base.builder import QueryBuilderBase +from src.db.utils.compression import decompress_html class GetNLPLocationMatchSubtaskInputQueryBuilder(QueryBuilderBase): @@ -16,21 +21,29 @@ async def run( self, session: AsyncSession ) -> list[NLPLocationMatchSubtaskInput]: - + container = EligibleContainer() query = ( select( - URL.id, + container.url_id, URLCompressedHTML.compressed_html ) .join( URLCompressedHTML, - URLCompressedHTML.url_id == URL.id + URLCompressedHTML.url_id == container.url_id, + ) + .where( + container.nlp_location, ) + .limit(NUMBER_OF_ENTRIES_PER_ITERATION) ) - # TODO: Add additional 
joins and where conditions - # TODO: Maybe leverage CTEs from survey query to get the precise URL ids - # without having to redo the logic here - + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + inputs: list[NLPLocationMatchSubtaskInput] = [ + NLPLocationMatchSubtaskInput( + url_id=mapping["url_id"], + html=decompress_html(mapping["compressed_html"]), + ) + for mapping in mappings + ] + return inputs - # TODO: Add limit leveraging NUMBER_OF_ENTRIES_PER_ITERATION constant diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 31c6fbec..850650c5 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,5 +1,3 @@ -import spacy - from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.core import \ @@ -8,7 +6,7 @@ MuckrockAgencyIDSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.core import \ NLPLocationMatchSubtaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient @@ -23,10 +21,12 @@ def __init__( self, pdap_client: PDAPClient, muckrock_api_interface: MuckrockAPIInterface, - adb_client: AsyncDatabaseClient + adb_client: AsyncDatabaseClient, + nlp_processor: NLPProcessor ): 
self._pdap_client = pdap_client self._muckrock_api_interface = muckrock_api_interface + self._nlp_processor = nlp_processor self.adb_client = adb_client def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: @@ -55,9 +55,7 @@ def _load_nlp_location_match_subtask(self, task_id: int) -> NLPLocationMatchSubt task_id=task_id, adb_client=self.adb_client, pdap_client=self._pdap_client, - processor=NLPProcessor( - spacy.load('en_core_web_trf', disable=['parser']) - ) + processor=self._nlp_processor ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py deleted file mode 100644 index 57f30fc3..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/core.py +++ /dev/null @@ -1,22 +0,0 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.core import \ - AgencyIDSubtaskSurveyQueryBuilder -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType - - -class AgencyIDSubtaskPlanner: - - def __init__( - self, - adb_client: AsyncDatabaseClient, - ) -> None: - self.adb_client = adb_client - - async def plan_next_subtask(self) -> AutoAgencyIDSubtaskType | None: - - next_subtask: AutoAgencyIDSubtaskType | None = \ - await self.adb_client.run_query_builder( - AgencyIDSubtaskSurveyQueryBuilder() - ) - return next_subtask - diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py index bcee8ccb..2b81d2de 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py @@ -26,14 +26,25 @@ class 
AgencyIDSubtaskSurveyQueryBuilder(QueryBuilderBase): (or an empty list if no subtasks have applicable URLs) """ + def __init__( + self, + allowed_subtasks: list[AutoAgencyIDSubtaskType] + ): + super().__init__() + self._allowed_subtasks = allowed_subtasks + async def run(self, session: AsyncSession) -> AutoAgencyIDSubtaskType | None: results: RowMapping = await sh.mapping(session, ELIGIBLE_COUNTS_QUERY) counts: Counter[str] = Counter(results) - max_count: int = max(counts.values()) + + allowed_counts: Counter[str] = await self._filter_allowed_counts(counts) + if len(allowed_counts) == 0: + return None + max_count: int = max(allowed_counts.values()) if max_count == 0: return None subtasks_with_max_count: list[str] = [ - subtask for subtask, count in counts.items() + subtask for subtask, count in allowed_counts.items() if count == max_count ] subtasks_as_enum_list: list[AutoAgencyIDSubtaskType] = [ @@ -49,6 +60,15 @@ async def run(self, session: AsyncSession) -> AutoAgencyIDSubtaskType | None: # Return the highest priority subtask return sorted_subtasks[0] + async def _filter_allowed_counts(self, counts: Counter[str]) -> Counter[str]: + return Counter( + { + subtask: count + for subtask, count in counts.items() + if AutoAgencyIDSubtaskType(subtask) in self._allowed_subtasks + } + ) + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py index 9b0c835e..5be64fbc 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py @@ -1,4 +1,4 @@ -from sqlalchemy import select +from sqlalchemy import select, CTE, Column from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.high_confidence_annotations import \ 
HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER @@ -14,17 +14,44 @@ NLP_LOCATION_CONTAINER from src.db.models.impl.url.core.sqlalchemy import URL -ELIGIBLE_CTE = ( - select( - URL.id, - CKAN_SUBTASK_CONTAINER.eligible_query.label("ckan"), - MUCKROCK_SUBTASK_CONTAINER.eligible_query.label("muckrock"), - HOMEPAGE_SUBTASK_CONTAINER.eligible_query.label("homepage"), - NLP_LOCATION_CONTAINER.eligible_query.label("nlp_location"), - ) - .where( - HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER.not_exists_query, - VALIDATED_EXISTS_CONTAINER.not_exists_query, - ) - .cte("eligible") -) \ No newline at end of file +class EligibleContainer: + + def __init__(self): + self._cte = ( + select( + URL.id, + CKAN_SUBTASK_CONTAINER.eligible_query.label("ckan"), + MUCKROCK_SUBTASK_CONTAINER.eligible_query.label("muckrock"), + HOMEPAGE_SUBTASK_CONTAINER.eligible_query.label("homepage"), + NLP_LOCATION_CONTAINER.eligible_query.label("nlp_location"), + ) + .where( + HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER.not_exists_query, + VALIDATED_EXISTS_CONTAINER.not_exists_query, + ) + .cte("eligible") + ) + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> Column[int]: + return self._cte.c['id'] + + @property + def ckan(self) -> Column[bool]: + return self._cte.c['ckan'] + + @property + def muckrock(self) -> Column[bool]: + return self._cte.c['muckrock'] + + @property + def homepage(self) -> Column[bool]: + return self._cte.c['homepage'] + + @property + def nlp_location(self) -> Column[bool]: + return self._cte.c['nlp_location'] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py index 6ff2841f..96a322cb 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py @@ -1,6 +1,7 @@ from sqlalchemy import select, ColumnElement, Integer, func -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import ELIGIBLE_CTE +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType @@ -12,11 +13,13 @@ def sum_count(col: ColumnElement[bool], subtask_type: AutoAgencyIDSubtaskType) - 0, ).label(subtask_type.value) +container = EligibleContainer() + ELIGIBLE_COUNTS_QUERY = ( select( - sum_count(ELIGIBLE_CTE.c.ckan, AutoAgencyIDSubtaskType.CKAN), - sum_count(ELIGIBLE_CTE.c.muckrock, AutoAgencyIDSubtaskType.MUCKROCK), - sum_count(ELIGIBLE_CTE.c.homepage, AutoAgencyIDSubtaskType.HOMEPAGE_MATCH), - sum_count(ELIGIBLE_CTE.c.nlp_location, AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH), + sum_count(container.ckan, AutoAgencyIDSubtaskType.CKAN), + sum_count(container.muckrock, AutoAgencyIDSubtaskType.MUCKROCK), + sum_count(container.homepage, AutoAgencyIDSubtaskType.HOMEPAGE_MATCH), + sum_count(container.nlp_location, AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH), ) ) \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 19606b84..0e0d5a39 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,15 +1,14 @@ from datetime import date -from typing import Optional, Any +from typing import Any from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor_.models.response import \ - NLPLocationMatchResponse from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse, \ + SearchAgencyByLocationOuterResponse from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @@ -52,7 +51,11 @@ async def search_agency_by_location( ) response_info: ResponseInfo = await self.access_manager.make_request(request_info) - raise NotImplementedError + outer_response = SearchAgencyByLocationOuterResponse( + **response_info.data + ) + + return outer_response.responses async def match_agency( self, diff --git a/src/external/pdap/dtos/search_agency_by_location/response.py b/src/external/pdap/dtos/search_agency_by_location/response.py index d894b2d8..54dcb5cb 100644 --- a/src/external/pdap/dtos/search_agency_by_location/response.py +++ b/src/external/pdap/dtos/search_agency_by_location/response.py @@ -1,6 +1,12 @@ from pydantic import BaseModel, Field +class SearchAgencyByLocationAgencyInfo(BaseModel): + agency_id: int + similarity: float = Field(ge=0, le=1) + class SearchAgencyByLocationResponse(BaseModel): request_id: int - agency_id: int - similarity: float = Field(ge=0, le=1) \ No newline at end of file + results: list[SearchAgencyByLocationAgencyInfo] + +class SearchAgencyByLocationOuterResponse(BaseModel): + responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py index b6a08ee8..7feb6d61 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py @@ -1,9 +1,11 @@ -from unittest.mock import create_autospec, AsyncMock +from unittest.mock import create_autospec import pytest from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient @@ -19,7 +21,8 @@ def operator( loader=AgencyIdentificationSubtaskLoader( pdap_client=create_autospec(PDAPClient), muckrock_api_interface=create_autospec(MuckrockAPIInterface), - adb_client=adb_client_test + adb_client=adb_client_test, + nlp_processor=create_autospec(NLPProcessor) ), ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py new file mode 100644 index 00000000..766a7ca5 --- /dev/null +++ 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py @@ -0,0 +1,15 @@ +import pytest_asyncio + +from src.db.dtos.url.mapping import URLMapping +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest_asyncio.fixture +async def url_ids( + db_data_creator: DBDataCreator, +) -> list[int]: + # Create 2 URLs with compressed HTML + url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_ids: list[int] = [url.url_id for url in url_mappings] + await db_data_creator.html_data(url_ids=url_ids) + return url_ids diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py new file mode 100644 index 00000000..e13ee7a6 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py @@ -0,0 +1,29 @@ +from unittest.mock import AsyncMock + +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +PATCH_ROOT = ( + "src.core.tasks.url.operators.agency_identification.subtasks." 
+ + "impl.nlp_location_match_.core.AgencyIDSubtaskInternalProcessor" +) + +@pytest.mark.asyncio +async def test_nlp_location_match( + operator: AgencyIdentificationTaskOperator, + url_ids: list[int], + monkeypatch +): + # Confirm operator meets prerequisites + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + mock_internal_processor = AsyncMock(spec=AgencyIDSubtaskInternalProcessor) + monkeypatch.setattr(PATCH_ROOT, mock_internal_processor) + +# + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_no_results.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_no_results.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_results.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_results.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py deleted file mode 100644 index 75eacd59..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_core.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.core 
import AgencyIdentificationTaskOperator -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_nlp_location_match( - db_data_creator: DBDataCreator, - operator: AgencyIdentificationTaskOperator -): - - # Create 2 URLs with compressed HTML - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) - url_ids: list[int] = [url.url_id for url in url_mappings] - await db_data_creator.html_data(url_ids=url_ids) - - # Confirm operator meets prerequisites - assert await operator.meets_task_prerequisites() - assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH - - # raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/survey/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/survey/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py b/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py new file mode 100644 index 00000000..8ace042e --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py @@ -0,0 +1,49 @@ +import pytest + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from tests.helpers.data_creator.core import DBDataCreator + +@pytest.mark.asyncio +async def test_survey_flag( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator, + monkeypatch +): + """ + Test that survey correctly disables Subtask flags + when the environment 
variable is set to disable that subtask + """ + + # Run basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency_name": "Test Agency" + } + ) + )[0].url_id + applicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.CKAN + ) + await db_data_creator.create_batch_url_links( + url_ids=[applicable_url_id], + batch_id=applicable_batch_id + ) + + # Confirm prerequisite met and subtask is CKAN + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.CKAN + + # Set flag to disable CKAN Subtask + monkeypatch.setenv( + "AGENCY_ID_CKAN_FLAG", "0" + ) + + # Confirm prerequisite no longer met. + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/loader/conftest.py b/tests/automated/integration/tasks/url/loader/conftest.py index 045236f9..52a17b5e 100644 --- a/tests/automated/integration/tasks/url/loader/conftest.py +++ b/tests/automated/integration/tasks/url/loader/conftest.py @@ -4,10 +4,11 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.loader import URLTaskOperatorLoader +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient -from src.external.internet_archives.client import InternetArchivesClient from src.external.pdap.client import PDAPClient from src.external.url_request.core import URLRequestInterface @@ -22,4 +23,5 @@ def loader() -> URLTaskOperatorLoader: 
pdap_client=AsyncMock(spec=PDAPClient), muckrock_api_interface=AsyncMock(spec=MuckrockAPIInterface), hf_inference_client=AsyncMock(spec=HuggingFaceInferenceClient), + nlp_processor=AsyncMock(spec=NLPProcessor) ) \ No newline at end of file diff --git a/tests/manual/agency_identifier/test_nlp_processor.py b/tests/manual/agency_identifier/test_nlp_processor.py new file mode 100644 index 00000000..c38a52b1 --- /dev/null +++ b/tests/manual/agency_identifier/test_nlp_processor.py @@ -0,0 +1,22 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor + +SAMPLE_HTML: str = """ + +I live in Pittsburgh, Allegheny, Pennsylvania. + +""" + +@pytest.mark.asyncio +async def test_nlp_processor_happy_path(): + nlp_processor = NLPProcessor() + response = nlp_processor.parse_for_locations(SAMPLE_HTML) + print(response) + +@pytest.mark.asyncio +async def test_nlp_processor_empty_html(): + nlp_processor = NLPProcessor() + response = nlp_processor.parse_for_locations("") + print(response) \ No newline at end of file diff --git a/uv.lock b/uv.lock index 08a5ddf8..3dffe619 100644 --- a/uv.lock +++ b/uv.lock @@ -508,6 +508,7 @@ dependencies = [ { name = "marshmallow" }, { name = "openai" }, { name = "pdap-access-manager" }, + { name = "pip" }, { name = "playwright" }, { name = "psycopg", extra = ["binary"] }, { name = "psycopg2-binary" }, @@ -558,6 +559,7 @@ requires-dist = [ { name = "marshmallow", specifier = "~=3.23.2" }, { name = "openai", specifier = "~=1.60.1" }, { name = "pdap-access-manager", specifier = "==0.3.6" }, + { name = "pip", specifier = ">=25.2" }, { name = "playwright", specifier = "~=1.49.1" }, { name = "psycopg", extras = ["binary"], specifier = "~=3.1.20" }, { name = "psycopg2-binary", specifier = "~=2.9.6" }, @@ -1641,6 +1643,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/6e/23/e98758924d1b3aac11a626268eabf7f3cf177e7837c28d47bf84c64532d0/pendulum-3.1.0-py3-none-any.whl", hash = "sha256:f9178c2a8e291758ade1e8dd6371b1d26d08371b4c7730a6e9a3ef8b16ebae0f", size = 111799, upload_time = "2025-04-19T14:02:34.739Z" }, ] +[[package]] +name = "pip" +version = "25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/16/650289cd3f43d5a2fadfd98c68bd1e1e7f2550a1a5326768cddfbcedb2c5/pip-25.2.tar.gz", hash = "sha256:578283f006390f85bb6282dffb876454593d637f5d1be494b5202ce4877e71f2", size = 1840021, upload_time = "2025-07-30T21:50:15.401Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl", hash = "sha256:6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717", size = 1752557, upload_time = "2025-07-30T21:50:13.323Z" }, +] + [[package]] name = "playwright" version = "1.49.1" From e3af970765b64716531bd9a6a2ba044b0867f8a2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 9 Sep 2025 08:25:17 -0400 Subject: [PATCH 29/33] Continue draft --- .../subtasks/impl/nlp_location_match_/core.py | 3 +- .../impl/nlp_location_match_/query.py | 2 +- .../agency_identification/subtasks/loader.py | 2 +- .../subtasks/models/run_info.py | 6 +- .../subtasks/templates/subtask.py | 2 +- .../end_to_end/test_core.py | 101 ++++++++++++++++-- .../internal_processor/conftest.py | 18 ++++ .../__init__.py} | 0 .../convert_search_agency_responses/params.py | 2 + .../test_core.py} | 0 .../extract_search_params/__init__.py | 0 .../extract_search_params/model.py | 4 + .../extract_search_params/test_core.py | 0 .../internal_processor/test_core.py | 6 ++ 14 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/conftest.py rename 
tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/{end_to_end/test_no_results.py => internal_processor/convert_search_agency_responses/__init__.py} (100%) create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py rename tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/{end_to_end/test_results.py => internal_processor/convert_search_agency_responses/test_core.py} (100%) create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py index 6aeec35e..0c172e5d 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -38,6 +38,7 @@ async def inner_logic(self) -> None: await self.run_subtask_iteration(inputs) async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: + self.linked_urls.extend([input_.url_id for input_ in inputs]) subtask_data_list: list[AutoAgencyIDSubtaskData] = await self._process_inputs(inputs) await 
self._upload_subtask_data(subtask_data_list) @@ -52,5 +53,5 @@ async def _process_inputs( async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: return await self.adb_client.run_query_builder( - query_builder=GetNLPLocationMatchSubtaskInputQueryBuilder(), + GetNLPLocationMatchSubtaskInputQueryBuilder(), ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py index db82b22d..32311bd1 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py @@ -40,7 +40,7 @@ async def run( mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) inputs: list[NLPLocationMatchSubtaskInput] = [ NLPLocationMatchSubtaskInput( - url_id=mapping["url_id"], + url_id=mapping["id"], html=decompress_html(mapping["compressed_html"]), ) for mapping in mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 850650c5..5dab9608 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -71,7 +71,7 @@ async def load_subtask( case AutoAgencyIDSubtaskType.CKAN: return self._load_ckan_subtask(task_id) case AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: - return self._load_muckrock_subtask(task_id) + return self._load_nlp_location_match_subtask(task_id) case AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: return self._load_homepage_match_subtask(task_id) raise ValueError(f"Unknown subtask type: {subtask_type}") diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py 
index b2ee3e28..524830e3 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py @@ -7,4 +7,8 @@ class AgencyIDSubtaskRunInfo(BaseModel): @property def is_success(self) -> bool: - return self.error is None \ No newline at end of file + return self.error is None + + @property + def has_linked_urls(self) -> bool: + return len(self.linked_url_ids) > 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py index c4cc6226..b4e4b018 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py @@ -18,7 +18,7 @@ def __init__( ) -> None: self.adb_client: AsyncDatabaseClient = adb_client self.task_id: int = task_id - self.linked_urls: list[int] | None = None + self.linked_urls: list[int] = [] async def run(self) -> AgencyIDSubtaskRunInfo: try: diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py index e13ee7a6..2c3ed419 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py @@ -1,20 +1,36 @@ -from unittest.mock import AsyncMock +from unittest.mock import AsyncMock, MagicMock import pytest +from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ AgencyIDSubtaskInternalProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.task_url import LinkTaskURL +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator PATCH_ROOT = ( "src.core.tasks.url.operators.agency_identification.subtasks." 
+ - "impl.nlp_location_match_.core.AgencyIDSubtaskInternalProcessor" + "impl.nlp_location_match_.core.AgencyIDSubtaskInternalProcessor.process" ) + + @pytest.mark.asyncio async def test_nlp_location_match( operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator, url_ids: list[int], monkeypatch ): @@ -22,8 +38,81 @@ async def test_nlp_location_match( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH - mock_internal_processor = AsyncMock(spec=AgencyIDSubtaskInternalProcessor) - monkeypatch.setattr(PATCH_ROOT, mock_internal_processor) + happy_path_url_id: int = url_ids[0] + error_url_id: int = url_ids[1] + + agency_ids: list[int] = await db_data_creator.create_agencies(count=2) + agency_id_25: int = agency_ids[0] + agency_id_75: int = agency_ids[1] + + async def mock_process_response( + self: AgencyIDSubtaskInternalProcessor, + inputs: list[NLPLocationMatchSubtaskInput], + ) -> list[AutoAgencyIDSubtaskData]: + response = [ + AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=self._task_id, + url_id=happy_path_url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=True, + ), + suggestions=[ + AgencySuggestion( + agency_id=agency_id_25, + confidence=25 + ), + AgencySuggestion( + agency_id=agency_id_75, + confidence=75 + ) + ] + ), + AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=self._task_id, + url_id=error_url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=False, + ), + suggestions=[], + error="Test error" + ) + ] + return response + + monkeypatch.setattr(AgencyIDSubtaskInternalProcessor, "process", mock_process_response) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + adb_client: AsyncDatabaseClient = operator.adb_client + # Confirm two URLs linked to the task + task_links: list[LinkTaskURL] = await 
adb_client.get_all(LinkTaskURL) + assert len(task_links) == 2 + assert {task_link.url_id for task_link in task_links} == set(url_ids) + assert {task_link.task_id for task_link in task_links} == {operator._task_id} + + # Confirm two subtasks were created + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 2 + assert {subtask.url_id for subtask in subtasks} == set(url_ids) + assert {subtask.task_id for subtask in subtasks} == {operator._task_id} + assert {subtask.type for subtask in subtasks} == {AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH} + assert {subtask.agencies_found for subtask in subtasks} == {True, False} + + + # Confirm one URL error info + error_infos: list[URLErrorInfo] = await adb_client.get_all(URLErrorInfo) + assert len(error_infos) == 1 + assert error_infos[0].task_id == operator._task_id + assert error_infos[0].url_id == error_url_id + assert error_infos[0].error == "Test error" + + # Confirm two suggestions for happy path URL id + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) + assert len(suggestions) == 2 + # Confirm expected agency ids + assert {suggestion.agency_id for suggestion in suggestions} == set(agency_ids) + # Confirm both have the expected confidence values + assert {suggestion.confidence for suggestion in suggestions} == {25, 75} -# - raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/conftest.py new file mode 100644 index 00000000..fa70c786 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/conftest.py @@ -0,0 +1,18 @@ +from unittest.mock import AsyncMock + +import pytest_asyncio + +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.external.pdap.client import PDAPClient + + +@pytest_asyncio.fixture +async def internal_processor() -> AgencyIDSubtaskInternalProcessor: + return AgencyIDSubtaskInternalProcessor( + nlp_processor=AsyncMock(spec=NLPProcessor), + pdap_client=AsyncMock(spec=PDAPClient), + task_id=1 + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_no_results.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_no_results.py rename to tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/__init__.py diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py new file mode 100644 index 00000000..139597f9 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py @@ -0,0 +1,2 @@ + + diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_results.py 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_results.py rename to tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py new file mode 100644 index 00000000..1efade83 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py @@ -0,0 +1,4 @@ +from pydantic import BaseModel + + +class TestExtractSearchParamsTestModel(BaseModel): diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py new file mode 100644 index 00000000..a2b03ae6 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.mark.asyncio +async def test_core(): + From 008ab745e298df64990c335e62b329bfa9468ad7 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Sep 2025 20:05:23 -0400 Subject: [PATCH 30/33] Continue Draft --- .../nlp_location_match_/processor/convert.py | 2 +- .../subtasks/models/suggestion.py | 4 +- .../search_agency_by_location/response.py | 2 +- .../convert_search_agency_responses/params.py | 7 ++ .../test_core.py | 104 ++++++++++++++++++ 5 files changed, 115 insertions(+), 4 deletions(-) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py index 3e0924ba..a18d1d81 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -71,7 +71,7 @@ def convert_search_agency_response_to_subtask_data( task_id=task_id, url_id=url_id, type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=len(suggestions) > 0 + agencies_found=True ) return AutoAgencyIDSubtaskData( pydantic_model=pydantic_model, diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py index 5dbc62ad..669c498c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py @@ 
-1,6 +1,6 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field class AgencySuggestion(BaseModel): agency_id: int - confidence: int \ No newline at end of file + confidence: int = Field(ge=0, le=100) \ No newline at end of file diff --git a/src/external/pdap/dtos/search_agency_by_location/response.py b/src/external/pdap/dtos/search_agency_by_location/response.py index 54dcb5cb..92242b5a 100644 --- a/src/external/pdap/dtos/search_agency_by_location/response.py +++ b/src/external/pdap/dtos/search_agency_by_location/response.py @@ -6,7 +6,7 @@ class SearchAgencyByLocationAgencyInfo(BaseModel): class SearchAgencyByLocationResponse(BaseModel): request_id: int - results: list[SearchAgencyByLocationAgencyInfo] + results: list[SearchAgencyByLocationAgencyInfo] = Field(min_length=1) class SearchAgencyByLocationOuterResponse(BaseModel): responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py index 139597f9..f0a27b97 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py @@ -1,2 +1,9 @@ +from pydantic import BaseModel +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + +class ConvertSearchAgencyResponsesTestParams(BaseModel): + search_agency_by_location_responses: 
list[SearchAgencyByLocationResponse] + expected_subtask_data: AutoAgencyIDSubtaskData diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py index e69de29b..fe5f5265 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py @@ -0,0 +1,104 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ + URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse, \ + SearchAgencyByLocationAgencyInfo +from tests.automated.integration.tasks.url.impl.agency_identification.subtasks.nlp_location_match.internal_processor.convert_search_agency_responses.params import \ + ConvertSearchAgencyResponsesTestParams + +PARAMETERS = [ + ConvertSearchAgencyResponsesTestParams( + search_agency_by_location_responses=[ + SearchAgencyByLocationResponse( + request_id=1, + results=[ + SearchAgencyByLocationAgencyInfo( + agency_id=1, + similarity=1.0, + ), + SearchAgencyByLocationAgencyInfo( + agency_id=2, + 
similarity=0.5, + ), + ] + ), + SearchAgencyByLocationResponse( + request_id=2, + results=[ + SearchAgencyByLocationAgencyInfo( + agency_id=3, + similarity=0.75, + ), + ] + ) + ], + expected_subtask_data=AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=1, + url_id=1, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=True, + ), + suggestions=[ + AgencySuggestion( + agency_id=1, + confidence=100, + ), + AgencySuggestion( + agency_id=2, + confidence=50, + ), + AgencySuggestion( + agency_id=3, + confidence=75, + ) + ] + ) + ), + ConvertSearchAgencyResponsesTestParams( + search_agency_by_location_responses=[ + SearchAgencyByLocationResponse( + request_id=2, + results=[ + SearchAgencyByLocationAgencyInfo( + agency_id=1, + similarity=1.0, + ), + SearchAgencyByLocationAgencyInfo( + agency_id=2, + similarity=0.5, + ), + ] + ) + ], + expected_subtask_data=AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=1, + url_id=2, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=True, + ), + suggestions=[ + AgencySuggestion( + agency_id=1, + confidence=100, + ), + AgencySuggestion( + agency_id=2, + confidence=50, + ) + ] + ) + ), +] + +@pytest.mark.asyncio +async def test_params() -> None: + mapper = URLRequestIDMapper() + mapper.add_mapping(request_id=1, url_id=1) + mapper.add_mapping(request_id=2, url_id=1) \ No newline at end of file From f07b388647bdd89dfcbe3df83c87960bb4860ae6 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 10 Sep 2025 21:10:14 -0400 Subject: [PATCH 31/33] Continue draft --- .../nlp_location_match_/processor/convert.py | 46 +++++++---- .../nlp_location_match_/processor/core.py | 78 ++++++++++++++++--- .../nlp_location_match_/processor/extract.py | 12 +++ .../processor/models/__init__.py | 0 .../processor/models/mappings/__init__.py | 0 .../models/mappings/url_id_nlp_response.py | 9 +++ .../models/mappings/url_id_search_params.py | 8 ++ 
.../convert_nlp_response/__init__.py | 0 .../test_state_only.py} | 4 +- .../test_core.py | 29 ++++++- .../{ => extract_search_params}/conftest.py | 0 .../extract_search_params/test_core.py | 41 ++++++++++ 12 files changed, 195 insertions(+), 32 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/__init__.py rename tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/{test_core.py => convert_nlp_response/test_state_only.py} (60%) rename tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/{ => extract_search_params}/conftest.py (100%) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py index a18d1d81..c0736b06 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -1,3 +1,4 @@ +from collections 
import defaultdict from math import ceil from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter @@ -40,22 +41,33 @@ def convert_search_agency_responses_to_subtask_data_list( task_id: int ) -> list[AutoAgencyIDSubtaskData]: subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + url_id_to_suggestions: dict[int, list[AgencySuggestion]] = defaultdict(list) + + # First, extract agency suggestions for URL for response in responses: + suggestions: list[AgencySuggestion] = _convert_search_agency_response_to_agency_suggestions(response) url_id: int = mapper.get_url_id_by_request_id(response.request_id) - subtask_data: AutoAgencyIDSubtaskData = \ - convert_search_agency_response_to_subtask_data( - response=response, - task_id=task_id, - url_id=url_id, - ) + url_id_to_suggestions[url_id].extend(suggestions) + + # Then, construct subtask data and + for url_id, suggestions in url_id_to_suggestions.items(): + pydantic_model: URLAutoAgencyIDSubtaskPydantic = convert_search_agency_response_to_subtask_pydantic( + url_id=url_id, + task_id=task_id + ) + + subtask_data = AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=suggestions + ) + subtask_data_list.append(subtask_data) return subtask_data_list -def convert_search_agency_response_to_subtask_data( - url_id: int, + +def _convert_search_agency_response_to_agency_suggestions( response: SearchAgencyByLocationResponse, - task_id: int -) -> AutoAgencyIDSubtaskData: +) -> list[AgencySuggestion]: suggestions: list[AgencySuggestion] = [] for result in response.results: agency_id: int = result.agency_id @@ -66,14 +78,18 @@ def convert_search_agency_response_to_subtask_data( confidence=confidence, ) suggestions.append(suggestion) + return suggestions + + + +def convert_search_agency_response_to_subtask_pydantic( + url_id: int, + task_id: int +) -> URLAutoAgencyIDSubtaskPydantic: - pydantic_model = URLAutoAgencyIDSubtaskPydantic( + return 
URLAutoAgencyIDSubtaskPydantic( task_id=task_id, url_id=url_id, type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, agencies_found=True ) - return AutoAgencyIDSubtaskData( - pydantic_model=pydantic_model, - suggestions=suggestions - ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py index f283ca7b..b1a6974d 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py @@ -1,10 +1,16 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ convert_nlp_response_to_search_agency_by_location_params, convert_search_agency_responses_to_subtask_data_list from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.extract import \ + _extract_all_search_params from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ URLRequestIDMapper from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ 
NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ @@ -59,22 +65,70 @@ def _extract_search_params( - self._mapper - self._counter """ - all_search_params: list[SearchAgencyByLocationParams] = [] + + url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ + self._match_urls_to_nlp_responses(inputs) + + url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ + self._match_urls_to_search_params(url_to_nlp_mappings) + + all_search_params: list[SearchAgencyByLocationParams] = \ + _extract_all_search_params(url_to_search_params_mappings) + + self._add_all_url_search_param_mappings(url_to_search_params_mappings) + + return all_search_params + + def _add_all_url_search_param_mappings( + self, + url_to_search_params_mappings: list[URLToSearchParamsMapping] + ) -> None: + """ + Modifies: + - self._mapper + """ + for mapping in url_to_search_params_mappings: + for search_param in mapping.search_params: + self._mapper.add_mapping( + request_id=search_param.request_id, + url_id=mapping.url_id, + ) + + def _match_urls_to_search_params( + self, + url_to_nlp_mappings: list[URLToNLPResponseMapping] + ) -> list[URLToSearchParamsMapping]: + """ + Modifies: + - self._counter + """ + url_to_search_params_mappings: list[URLToSearchParamsMapping] = [] + for mapping in url_to_nlp_mappings: + search_params: list[SearchAgencyByLocationParams] = \ + convert_nlp_response_to_search_agency_by_location_params( + counter=self._counter, + nlp_response=mapping.nlp_response, + ) + mapping = URLToSearchParamsMapping( + url_id=mapping.url_id, + search_params=search_params, + ) + url_to_search_params_mappings.append(mapping) + return url_to_search_params_mappings + + def _match_urls_to_nlp_responses( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[URLToNLPResponseMapping]: + url_to_nlp_mappings: list[URLToNLPResponseMapping] = [] for input_ in inputs: nlp_response: 
NLPLocationMatchResponse = self._get_location_match(input_.html) - search_params: list[ - SearchAgencyByLocationParams] = convert_nlp_response_to_search_agency_by_location_params( - counter=self._counter, + mapping = URLToNLPResponseMapping( + url_id=input_.url_id, nlp_response=nlp_response, ) - for search_param in search_params: - self._mapper.add_mapping( - request_id=search_param.request_id, - url_id=input_.url_id, - ) - search_params.append(search_param) - all_search_params.extend(search_params) - return all_search_params + url_to_nlp_mappings.append(mapping) + return url_to_nlp_mappings def _get_location_match( self, diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py new file mode 100644 index 00000000..053f4fb5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py @@ -0,0 +1,12 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams + + +def _extract_all_search_params( + url_to_search_params_mappings: list[URLToSearchParamsMapping] +) -> list[SearchAgencyByLocationParams]: + all_search_params: list[SearchAgencyByLocationParams] = [] + for mapping in url_to_search_params_mappings: + all_search_params.extend(mapping.search_params) + return all_search_params diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py new file mode 100644 index 00000000..7bb7e701 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse + + +class URLToNLPResponseMapping(BaseModel): + url_id: int + nlp_response: NLPLocationMatchResponse \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py new file mode 100644 index 00000000..07287092 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams + + +class URLToSearchParamsMapping(BaseModel): + url_id: int + search_params: list[SearchAgencyByLocationParams] \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py similarity index 60% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py rename to tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py index a2b03ae6..cff69bd5 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py @@ -2,5 +2,5 @@ @pytest.mark.asyncio -async def test_core(): - +async def test_core( +): \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py index fe5f5265..1d36d120 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py +++ 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py @@ -1,5 +1,7 @@ import pytest +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ + convert_search_agency_responses_to_subtask_data_list from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ URLRequestIDMapper from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData @@ -63,7 +65,7 @@ ConvertSearchAgencyResponsesTestParams( search_agency_by_location_responses=[ SearchAgencyByLocationResponse( - request_id=2, + request_id=3, results=[ SearchAgencyByLocationAgencyInfo( agency_id=1, @@ -98,7 +100,28 @@ ] @pytest.mark.asyncio -async def test_params() -> None: +async def test_params( +) -> None: mapper = URLRequestIDMapper() mapper.add_mapping(request_id=1, url_id=1) - mapper.add_mapping(request_id=2, url_id=1) \ No newline at end of file + mapper.add_mapping(request_id=2, url_id=1) + mapper.add_mapping(request_id=3, url_id=2) + + search_responses: list[SearchAgencyByLocationResponse] = [] + for param in PARAMETERS: + search_responses.extend(param.search_agency_by_location_responses) + + subtask_data_list: list[AutoAgencyIDSubtaskData] = \ + convert_search_agency_responses_to_subtask_data_list( + responses=search_responses, + task_id=1, + mapper=mapper, + ) + + assert len(subtask_data_list) == len(PARAMETERS) + + for subtask_data, param in zip(subtask_data_list, PARAMETERS): + expected_subtask_data: AutoAgencyIDSubtaskData = param.expected_subtask_data + assert subtask_data.pydantic_model == expected_subtask_data.pydantic_model + assert subtask_data.suggestions == expected_subtask_data.suggestions + diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/conftest.py 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/conftest.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/conftest.py rename to tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/conftest.py diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py index e69de29b..5779b799 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py @@ -0,0 +1,41 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + + +@pytest.mark.asyncio +async def test_core( + internal_processor: AgencyIDSubtaskInternalProcessor +): + # Define NLPLocationMatchSubtaskInputs + inputs: list[NLPLocationMatchSubtaskInput] = [ + NLPLocationMatchSubtaskInput( + url_id=1, + html="State and multiple locations" + ), + NLPLocationMatchSubtaskInput( + url_id=2, + html="Single location" + ), + NLPLocationMatchSubtaskInput( + url_id=3, + html="No location" + ) + ] + + + # Set _get_location_match responses + + + # Run _extract_search_params + + + # Validate results + + # Validate counter + + 
# Validate mapper \ No newline at end of file From dd21a9cb6ac26d6bdcdd8350d6a0d413a4970f70 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 11 Sep 2025 09:02:28 -0400 Subject: [PATCH 32/33] Continue Draft --- .../nlp_location_match_/processor/convert.py | 15 +++++++ .../nlp_location_match_/processor/core.py | 26 ++++++++---- .../models/mappings/url_id_search_params.py | 6 ++- .../convert_nlp_response/test_state_only.py | 6 --- .../extract_search_params/__init__.py | 0 .../extract_search_params/model.py | 4 -- .../extract_search_params/test_core.py | 41 ------------------- .../__init__.py | 0 .../conftest.py | 10 ++--- .../match_urls_to_search_params/test_empty.py | 14 +++++++ .../test_no_state_any_locations.py | 14 +++++++ .../test_state_multiple_locations.py | 14 +++++++ .../test_state_no_locations.py | 14 +++++++ .../test_state_one_location.py | 14 +++++++ 14 files changed, 113 insertions(+), 65 deletions(-) delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/__init__.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py rename tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/{convert_nlp_response => match_urls_to_search_params}/__init__.py (100%) rename tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/{extract_search_params => match_urls_to_search_params}/conftest.py (74%) create mode 100644 
tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py index c0736b06..7f0d57b7 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -4,6 +4,8 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ NLPLocationMatchResponse 
from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData @@ -80,6 +82,19 @@ def _convert_search_agency_response_to_agency_suggestions( suggestions.append(suggestion) return suggestions +def convert_empty_url_search_param_mappings_to_subtask_data_list( + mappings: list[URLToSearchParamsMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + results: list[AutoAgencyIDSubtaskData] = [] + for mapping in mappings: + if not mapping.empty: + raise ValueError("URLToSearchParamsMapping expected empty in conversion function.") + subtask_data = AutoAgencyIDSubtaskData( + + ) + + def convert_search_agency_response_to_subtask_pydantic( diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py index b1a6974d..4c17a166 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py @@ -40,10 +40,25 @@ async def process( inputs: list[NLPLocationMatchSubtaskInput] ) -> list[AutoAgencyIDSubtaskData]: - search_params: list[SearchAgencyByLocationParams] = self._extract_search_params( + url_search_param_mappings: list[URLToSearchParamsMapping] = self._extract_search_params( inputs=inputs ) + # Filter out empty params + url_search_param_mappings_empty: list[URLToSearchParamsMapping] = \ + [mapping for mapping in url_search_param_mappings if mapping.is_empty] + + # Convert empty params to subtask data with empty agencies + subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_empty_url_search_param_mappings_to_subtask_data_list( + responses=[], + task_id=self._task_id, + mapper=self._mapper, + ) + + + + search_responses: list[SearchAgencyByLocationResponse] = \ await 
self._get_pdap_info(search_params) @@ -59,7 +74,7 @@ async def process( def _extract_search_params( self, inputs: list[NLPLocationMatchSubtaskInput] - ) -> list[SearchAgencyByLocationParams]: + ) -> list[URLToSearchParamsMapping]: """ Modifies: - self._mapper @@ -72,12 +87,7 @@ def _extract_search_params( url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ self._match_urls_to_search_params(url_to_nlp_mappings) - all_search_params: list[SearchAgencyByLocationParams] = \ - _extract_all_search_params(url_to_search_params_mappings) - - self._add_all_url_search_param_mappings(url_to_search_params_mappings) - - return all_search_params + return url_to_search_params_mappings def _add_all_url_search_param_mappings( self, diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py index 07287092..5ab9deac 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py @@ -5,4 +5,8 @@ class URLToSearchParamsMapping(BaseModel): url_id: int - search_params: list[SearchAgencyByLocationParams] \ No newline at end of file + search_params: list[SearchAgencyByLocationParams] + + @property + def is_empty(self) -> bool: + return len(self.search_params) == 0 \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py deleted file mode 100644 index 
cff69bd5..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/test_state_only.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_core( -): \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py deleted file mode 100644 index 1efade83..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/model.py +++ /dev/null @@ -1,4 +0,0 @@ -from pydantic import BaseModel - - -class TestExtractSearchParamsTestModel(BaseModel): diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py deleted file mode 100644 index 5779b799..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/test_core.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ - NLPLocationMatchSubtaskInput -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor - - - -@pytest.mark.asyncio -async def test_core( - internal_processor: AgencyIDSubtaskInternalProcessor -): - # Define NLPLocationMatchSubtaskInputs - inputs: list[NLPLocationMatchSubtaskInput] = [ - NLPLocationMatchSubtaskInput( - url_id=1, - html="State and multiple locations" - ), - NLPLocationMatchSubtaskInput( - url_id=2, - html="Single location" - ), - NLPLocationMatchSubtaskInput( - url_id=3, - html="No location" - ) - ] - - - # Set _get_location_match responses - - - # Run _extract_search_params - - - # Validate results - - # Validate counter - - # Validate mapper \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_nlp_response/__init__.py rename to tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py similarity index 74% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/conftest.py rename to 
tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py index fa70c786..2abee544 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/extract_search_params/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py @@ -1,6 +1,6 @@ from unittest.mock import AsyncMock -import pytest_asyncio +import pytest from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ AgencyIDSubtaskInternalProcessor @@ -9,10 +9,10 @@ from src.external.pdap.client import PDAPClient -@pytest_asyncio.fixture -async def internal_processor() -> AgencyIDSubtaskInternalProcessor: +@pytest.fixture +def internal_processor() -> AgencyIDSubtaskInternalProcessor: return AgencyIDSubtaskInternalProcessor( nlp_processor=AsyncMock(spec=NLPProcessor), - pdap_client=AsyncMock(spec=PDAPClient), + pdap_client=AsyncMock(PDAPClient), task_id=1 - ) \ No newline at end of file + ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py new file mode 100644 index 00000000..01899f30 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_empty( + internal_processor: 
AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has no US State or locations, + that result is not returned + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py new file mode 100644 index 00000000..5fbbc6b5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_no_state_any_locations( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has no US State and any locations + that the result is not returned + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py new file mode 100644 index 00000000..6e7aef6a --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + 
+@pytest.mark.asyncio() +async def test_state_multiple_locations( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has a US State and multiple locations + then multiple results are returned with separate request ids + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py new file mode 100644 index 00000000..c0b1cef4 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_state_no_locations( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has a US State and no locations + then no result is returned + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py new file mode 100644 index 00000000..7b4ef303 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py @@ -0,0 +1,14 @@ +import pytest + +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_state_one_location( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has a US State and one locatio + then one result is returned + """ \ No newline at end of file From 52abc9cc60fb57b7acdec5b2557c6e89f848d5dd Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 12 Sep 2025 11:01:32 -0400 Subject: [PATCH 33/33] Finish draft --- src/core/tasks/handler.py | 5 +- src/core/tasks/scheduled/manager.py | 11 +- src/core/tasks/url/loader.py | 3 +- .../operators/agency_identification/core.py | 1 + .../queries/ctes/whitelisted_root_urls.py | 5 +- .../impl/nlp_location_match_/constants.py | 4 +- .../processor/constants.py | 3 + .../nlp_location_match_/processor/convert.py | 124 ++++++++++---- .../nlp_location_match_/processor/core.py | 156 ++++++++---------- .../nlp_location_match_/processor/filter.py | 59 +++++++ .../models/mappings/url_id_search_response.py | 8 + .../processor/models/subsets}/__init__.py | 0 .../processor/models/subsets/nlp_responses.py | 9 + .../processor/nlp/constants.py | 17 +- .../nlp_location_match_/processor/nlp/core.py | 19 +++ .../processor/nlp/models/response.py | 7 +- .../processor/nlp/preprocess.py | 20 +++ .../subtasks/templates/subtask.py | 5 +- .../models/impl/link/batch_url/sqlalchemy.py | 2 - src/db/models/impl/url/core/sqlalchemy.py | 2 +- .../agency/suggestion/sqlalchemy.py | 2 +- .../dtos/search_agency_by_location/params.py | 9 +- .../convert_search_agency_responses/params.py | 9 - .../test_core.py | 127 -------------- .../test_nlp_response_valid.py | 57 +++++++ 25 files changed, 387 insertions(+), 277 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py create mode 100644 
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py rename {tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses => src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py diff --git a/src/core/tasks/handler.py b/src/core/tasks/handler.py index 7f488594..6ddca6eb 100644 --- a/src/core/tasks/handler.py +++ b/src/core/tasks/handler.py @@ -50,8 +50,11 @@ async def handle_task_error(self, run_info: TaskOperatorRunInfo): # task_id=run_info.task_id, error=run_info.message ) + msg: str = f"Task {run_info.task_id} ({run_info.task_type.value}) failed with error: {run_info.message}" + print(msg) self.discord_poster.post_to_discord( - message=f"Task {run_info.task_id} ({run_info.task_type.value}) failed with error.") + message=msg + ) async def link_urls_to_task(self, task_id: int, url_ids: list[int]): await self.adb_client.link_urls_to_task( diff --git 
a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index 86dfff70..87cb5a27 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -1,6 +1,3 @@ -from apscheduler.job import Job -from apscheduler.schedulers.asyncio import AsyncIOScheduler - from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.mixins.link_urls import LinkURLsMixin @@ -39,15 +36,19 @@ async def add_scheduled_tasks(self): self._registry """ entries: list[ScheduledTaskEntry] = await self._loader.load_entries() - for idx, entry in enumerate(entries): + enabled_entries: list[ScheduledTaskEntry] = [] + for entry in entries: if not entry.enabled: print(f"{entry.operator.task_type.value} is disabled. Skipping add to scheduler.") continue + enabled_entries.append(entry) + initial_lag: int = 1 + for idx, entry in enumerate(enabled_entries): await self._registry.add_job( func=self.run_task, entry=entry, - minute_lag=idx + minute_lag=idx + initial_lag ) def shutdown(self): diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 91b52f50..600ea1d2 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -83,7 +83,8 @@ async def _get_agency_identification_task_operator(self) -> URLTaskEntry: loader=AgencyIdentificationSubtaskLoader( pdap_client=self.pdap_client, muckrock_api_interface=self.muckrock_api_interface, - adb_client=self.adb_client + adb_client=self.adb_client, + nlp_processor=self.nlp_processor ) ) return URLTaskEntry( diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index d4f5f87c..92ece84e 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -65,6 +65,7 @@ async def run_subtask( async def inner_task_logic(self) -> None: subtask_operator: 
AgencyIDSubtaskOperatorBase = await self.load_subtask(self._subtask) + print(f"Running Subtask: {self._subtask.value}") run_info: AgencyIDSubtaskRunInfo = await self.run_subtask(subtask_operator) await self.link_urls_to_task(run_info.linked_url_ids) if not run_info.is_success: diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py index 66f7c777..1af8f46c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -8,7 +8,6 @@ from src.db.models.impl.url.core.sqlalchemy import URL WHITELISTED_ROOT_URLS_CTE: CTE = ( - # TODO: Check for no fan-out select( URL.id ) @@ -33,7 +32,9 @@ ) .where( # The connected URLs must be Meta URLs - FlagURLValidated.type == URLValidatedType.META_URL + FlagURLValidated.type == URLValidatedType.META_URL, + # Root URL can't be "https://catalog.data.gov" + URL.url != "https://catalog.data.gov" ) .group_by( URL.id diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py index fb8f22ba..b8b4ce4d 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py @@ -1,4 +1,4 @@ -ITERATIONS_PER_SUBTASK = 1 -NUMBER_OF_ENTRIES_PER_ITERATION = 10 \ No newline at end of file +ITERATIONS_PER_SUBTASK = 2 +NUMBER_OF_ENTRIES_PER_ITERATION = 20 \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py new file mode 100644 index 00000000..cc16da9f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py @@ -0,0 +1,3 @@ + + +MAX_NLP_CONFIDENCE: int = 90 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py index 7f0d57b7..103580da 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -1,11 +1,15 @@ -from collections import defaultdict from math import ceil -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ - URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.constants import \ + MAX_NLP_CONFIDENCE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import \ + RequestCounter +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ URLToSearchParamsMapping +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ NLPLocationMatchResponse from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData @@ -22,14 +26,13 @@ def convert_nlp_response_to_search_agency_by_location_params( ) -> list[SearchAgencyByLocationParams]: params: list[SearchAgencyByLocationParams] = [] for location in nlp_response.locations: - if nlp_response.us_state is not None: - query: str = f"{location}, {nlp_response.us_state.name}" - else: - query: str = location + if nlp_response.us_state is None: + raise ValueError("US State is None; cannot convert NLP response to search agency by location params") request_id: int = counter.next() param = SearchAgencyByLocationParams( request_id=request_id, - query=query + query=location, + iso=nlp_response.us_state.iso, ) params.append(param) @@ -38,63 +41,93 @@ def convert_nlp_response_to_search_agency_by_location_params( def convert_search_agency_responses_to_subtask_data_list( - mapper: URLRequestIDMapper, - responses: list[SearchAgencyByLocationResponse], + mappings: list[URLToSearchResponseMapping], task_id: int ) -> list[AutoAgencyIDSubtaskData]: subtask_data_list: list[AutoAgencyIDSubtaskData] = [] - url_id_to_suggestions: dict[int, list[AgencySuggestion]] = defaultdict(list) # First, extract agency suggestions for URL - for response in responses: - suggestions: list[AgencySuggestion] = _convert_search_agency_response_to_agency_suggestions(response) - url_id: int = mapper.get_url_id_by_request_id(response.request_id) - url_id_to_suggestions[url_id].extend(suggestions) - - # Then, construct subtask data and - for url_id, suggestions in url_id_to_suggestions.items(): + for mapping in mappings: + url_id: int = 
mapping.url_id + search_responses: list[SearchAgencyByLocationResponse] = mapping.search_responses + suggestions: list[AgencySuggestion] = _convert_search_agency_response_to_agency_suggestions( + search_responses + ) pydantic_model: URLAutoAgencyIDSubtaskPydantic = convert_search_agency_response_to_subtask_pydantic( url_id=url_id, task_id=task_id ) - subtask_data = AutoAgencyIDSubtaskData( pydantic_model=pydantic_model, suggestions=suggestions ) - subtask_data_list.append(subtask_data) + return subtask_data_list def _convert_search_agency_response_to_agency_suggestions( - response: SearchAgencyByLocationResponse, + responses: list[SearchAgencyByLocationResponse], ) -> list[AgencySuggestion]: suggestions: list[AgencySuggestion] = [] - for result in response.results: - agency_id: int = result.agency_id - similarity: float = result.similarity - confidence: int = ceil(similarity * 100) - suggestion: AgencySuggestion = AgencySuggestion( - agency_id=agency_id, - confidence=confidence, - ) - suggestions.append(suggestion) + for response in responses: + for result in response.results: + agency_id: int = result.agency_id + similarity: float = result.similarity + confidence: int = min(ceil(similarity * 100), MAX_NLP_CONFIDENCE) + suggestion: AgencySuggestion = AgencySuggestion( + agency_id=agency_id, + confidence=confidence, + ) + suggestions.append(suggestion) return suggestions +def convert_url_ids_to_empty_subtask_data_list( + url_ids: list[int], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + results: list[AutoAgencyIDSubtaskData] = [] + for url_id in url_ids: + subtask_data = AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=False + ), + suggestions=[] + ) + results.append(subtask_data) + + return results + + + def convert_empty_url_search_param_mappings_to_subtask_data_list( mappings: list[URLToSearchParamsMapping], task_id: int ) -> 
list[AutoAgencyIDSubtaskData]: - results: list[AutoAgencyIDSubtaskData] = [] + url_ids: list[int] = [] for mapping in mappings: - if not mapping.empty: - raise ValueError("URLToSearchParamsMapping expected empty in conversion function.") - subtask_data = AutoAgencyIDSubtaskData( + url_ids.append(mapping.url_id) - ) + return convert_url_ids_to_empty_subtask_data_list( + url_ids=url_ids, + task_id=task_id + ) +def convert_invalid_url_nlp_mappings_to_subtask_data_list( + mappings: list[URLToNLPResponseMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + url_ids: list[int] = [] + for mapping in mappings: + url_ids.append(mapping.url_id) + return convert_url_ids_to_empty_subtask_data_list( + url_ids=url_ids, + task_id=task_id + ) def convert_search_agency_response_to_subtask_pydantic( @@ -108,3 +141,22 @@ def convert_search_agency_response_to_subtask_pydantic( type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, agencies_found=True ) + + +def convert_urls_to_search_params( + url_to_nlp_mappings: list[URLToNLPResponseMapping] +) -> list[URLToSearchParamsMapping]: + url_to_search_params_mappings: list[URLToSearchParamsMapping] = [] + counter = RequestCounter() + for mapping in url_to_nlp_mappings: + search_params: list[SearchAgencyByLocationParams] = \ + convert_nlp_response_to_search_agency_by_location_params( + counter=counter, + nlp_response=mapping.nlp_response, + ) + mapping = URLToSearchParamsMapping( + url_id=mapping.url_id, + search_params=search_params, + ) + url_to_search_params_mappings.append(mapping) + return url_to_search_params_mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py index 4c17a166..1e349318 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py @@ -1,20 +1,28 @@ +from collections import defaultdict + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ - convert_nlp_response_to_search_agency_by_location_params, convert_search_agency_responses_to_subtask_data_list -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import RequestCounter -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.extract import \ - _extract_all_search_params + convert_search_agency_responses_to_subtask_data_list, \ + convert_invalid_url_nlp_mappings_to_subtask_data_list, convert_urls_to_search_params +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.filter import \ + filter_valid_and_invalid_nlp_responses, filter_top_n_suggestions from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ URLRequestIDMapper -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ - NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ URLToNLPResponseMapping from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ URLToSearchParamsMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ + NLPResponseSubsets from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.preprocess import \ + preprocess_html from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.external.pdap.client import PDAPClient from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams @@ -31,100 +39,48 @@ def __init__( ): self._nlp_processor = nlp_processor self._pdap_client = pdap_client - self._counter = RequestCounter() - self._mapper = URLRequestIDMapper() self._task_id = task_id async def process( self, inputs: list[NLPLocationMatchSubtaskInput] ) -> list[AutoAgencyIDSubtaskData]: - - url_search_param_mappings: list[URLToSearchParamsMapping] = self._extract_search_params( - inputs=inputs - ) + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] - # Filter out empty params - url_search_param_mappings_empty: list[URLToSearchParamsMapping] = \ - [mapping for mapping in url_search_param_mappings if mapping.is_empty] + url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ + self._match_urls_to_nlp_responses(inputs) + + # Filter out valid and invalid NLP responses + nlp_response_subsets: NLPResponseSubsets = \ + filter_valid_and_invalid_nlp_responses(url_to_nlp_mappings) - # Convert empty params to subtask data with empty agencies + # For invalid responses, convert to subtask data with empty agencies subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ - 
convert_empty_url_search_param_mappings_to_subtask_data_list( - responses=[], + convert_invalid_url_nlp_mappings_to_subtask_data_list( + mappings=nlp_response_subsets.invalid, task_id=self._task_id, - mapper=self._mapper, ) + subtask_data_list.extend(subtask_data_no_agency_list) + # For valid responses, convert to search param mappings + url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ + convert_urls_to_search_params(nlp_response_subsets.valid) + response_mappings: list[URLToSearchResponseMapping] = \ + await self._get_pdap_info(url_to_search_params_mappings) - search_responses: list[SearchAgencyByLocationResponse] = \ - await self._get_pdap_info(search_params) - - subtask_data_list: list[AutoAgencyIDSubtaskData] = \ + subtask_data_list_agency_list: list[AutoAgencyIDSubtaskData] = \ convert_search_agency_responses_to_subtask_data_list( - responses=search_responses, + mappings=response_mappings, task_id=self._task_id, - mapper=self._mapper, ) - return subtask_data_list - - def _extract_search_params( - self, - inputs: list[NLPLocationMatchSubtaskInput] - ) -> list[URLToSearchParamsMapping]: - """ - Modifies: - - self._mapper - - self._counter - """ - - url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ - self._match_urls_to_nlp_responses(inputs) - - url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ - self._match_urls_to_search_params(url_to_nlp_mappings) + filter_top_n_suggestions(subtask_data_list_agency_list) - return url_to_search_params_mappings + subtask_data_list.extend(subtask_data_list_agency_list) - def _add_all_url_search_param_mappings( - self, - url_to_search_params_mappings: list[URLToSearchParamsMapping] - ) -> None: - """ - Modifies: - - self._mapper - """ - for mapping in url_to_search_params_mappings: - for search_param in mapping.search_params: - self._mapper.add_mapping( - request_id=search_param.request_id, - url_id=mapping.url_id, - ) - - def _match_urls_to_search_params( - self, - url_to_nlp_mappings: 
list[URLToNLPResponseMapping] - ) -> list[URLToSearchParamsMapping]: - """ - Modifies: - - self._counter - """ - url_to_search_params_mappings: list[URLToSearchParamsMapping] = [] - for mapping in url_to_nlp_mappings: - search_params: list[SearchAgencyByLocationParams] = \ - convert_nlp_response_to_search_agency_by_location_params( - counter=self._counter, - nlp_response=mapping.nlp_response, - ) - mapping = URLToSearchParamsMapping( - url_id=mapping.url_id, - search_params=search_params, - ) - url_to_search_params_mappings.append(mapping) - return url_to_search_params_mappings + return subtask_data_list def _match_urls_to_nlp_responses( self, @@ -144,10 +100,44 @@ def _get_location_match( self, html: str ) -> NLPLocationMatchResponse: - return self._nlp_processor.parse_for_locations(html) + preprocessed_html: str = preprocess_html(html) + return self._nlp_processor.parse_for_locations(preprocessed_html) async def _get_pdap_info( self, - params: list[SearchAgencyByLocationParams] - ) -> list[SearchAgencyByLocationResponse]: - return await self._pdap_client.search_agency_by_location(params) + mappings: list[URLToSearchParamsMapping] + ) -> list[URLToSearchResponseMapping]: + if len(mappings) == 0: + return [] + params: list[SearchAgencyByLocationParams] = [] + # Map request IDs to URL IDs for later use + mapper = URLRequestIDMapper() + for mapping in mappings: + for search_param in mapping.search_params: + mapper.add_mapping( + request_id=search_param.request_id, + url_id=mapping.url_id, + ) + params.append(search_param) + + url_id_to_search_responses: dict[int, list[SearchAgencyByLocationResponse]] = defaultdict(list) + + responses: list[SearchAgencyByLocationResponse] = await self._pdap_client.search_agency_by_location(params) + # Map responses to URL IDs via request IDs + for response in responses: + request_id: int = response.request_id + url_id: int = mapper.get_url_id_by_request_id(request_id) + url_id_to_search_responses[url_id].append(response) + + # 
Reconcile URL IDs to search responses + response_mappings: list[URLToSearchResponseMapping] = [] + for url_id, responses in url_id_to_search_responses.items(): + mapping = URLToSearchResponseMapping( + url_id=url_id, + search_responses=responses, + ) + response_mappings.append(mapping) + + return response_mappings + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py new file mode 100644 index 00000000..ff8b2de5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py @@ -0,0 +1,59 @@ +from collections import defaultdict + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ + NLPResponseSubsets +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion + + +def filter_valid_and_invalid_nlp_responses( + mappings: list[URLToNLPResponseMapping] +) -> NLPResponseSubsets: + valid: list[URLToNLPResponseMapping] = [] + invalid: list[URLToNLPResponseMapping] = [] + for mapping in mappings: + nlp_response: NLPLocationMatchResponse = mapping.nlp_response + if nlp_response.valid: + valid.append(mapping) + else: + invalid.append(mapping) + return NLPResponseSubsets( + valid=valid, + invalid=invalid, + ) + +def filter_top_n_suggestions( + subtask_data_list: 
list[AutoAgencyIDSubtaskData], + n: int = 5 +) -> None: + """Filters out all but the top N suggestions for each URL. + + Modifies: + - AutoAgencyIDSubtaskData.suggestions + """ + for subtask_data in subtask_data_list: + # Eliminate agency ID duplicates; + agency_to_suggestions: dict[int, list[AgencySuggestion]] = defaultdict(list) + for suggestion in subtask_data.suggestions: + agency_to_suggestions[suggestion.agency_id].append(suggestion) + + # in the case of a tie, keep the suggestion with the highest confidence + deduped_suggestions: list[AgencySuggestion] = [] + for agency_suggestions in agency_to_suggestions.values(): + agency_suggestions.sort( + key=lambda x: x.confidence, + reverse=True # Descending order + ) + deduped_suggestions.append(agency_suggestions[0]) + + # Sort suggestions by confidence and keep top N + suggestions_sorted: list[AgencySuggestion] = sorted( + deduped_suggestions, + key=lambda x: x.confidence, + reverse=True # Descending order + ) + subtask_data.suggestions = suggestions_sorted[:n] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py new file mode 100644 index 00000000..9a88b89d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class URLToSearchResponseMapping(BaseModel): + url_id: int + search_responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/__init__.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py new file mode 100644 index 00000000..22fdcf98 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping + + +class NLPResponseSubsets(BaseModel): + valid: list[URLToNLPResponseMapping] + invalid: list[URLToNLPResponseMapping] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py index 267f728b..8b9076fe 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py @@ -1,3 +1,18 @@ -TOP_N_LOCATIONS_COUNT: int = 5 \ No newline at end of file +TOP_N_LOCATIONS_COUNT: int = 5 + +INVALID_LOCATION_CHARACTERS: set[str] = { + "=", + "\\", + "/", + "\'", + "\"," +} + +# State ISOs 
that commonly align with other words, +# Which cannot be used in simple text scanning +INVALID_SCAN_ISOS: set[str] = { + "IN", + "OR", +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py index 442585f2..8e723aa6 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py @@ -6,6 +6,8 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.check import \ is_name_us_state, is_iso_us_state +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \ + INVALID_LOCATION_CHARACTERS, INVALID_SCAN_ISOS from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.convert import \ convert_us_state_name_to_us_state, convert_us_state_iso_to_us_state from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ @@ -39,10 +41,27 @@ def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: us_state_counter: Counter[USState] = Counter() location_counter: Counter[str] = Counter() + # Scan over tokens + for token in doc: + upper_token: str = token.text.upper() + # Disregard certain ISOs that align with common words + if upper_token in INVALID_SCAN_ISOS: + continue + if not is_iso_us_state(upper_token): + continue + + us_state: USState | None = convert_us_state_iso_to_us_state(upper_token) + if us_state is not None: + us_state_counter[us_state] += 1 + + + # Scan over entities using spacy for ent in doc.ents: if ent.label_ != "GPE": # Geopolitical Entity continue text: str = ent.text + if 
any(char in text for char in INVALID_LOCATION_CHARACTERS): + continue if is_name_us_state(text): us_state: USState | None = convert_us_state_name_to_us_state(text) if us_state is not None: diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py index 23904bdf..387e32de 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py @@ -9,9 +9,10 @@ class NLPLocationMatchResponse(BaseModel): us_state: USState | None @property - def empty(self) -> bool: - if self.us_state is not None: + def valid(self) -> bool: + # Valid responses must have a US State and at least one location + if self.us_state is None: return False - if len(self.locations) > 0: + if len(self.locations) == 0: return False return True diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py new file mode 100644 index 00000000..da20f4f4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py @@ -0,0 +1,20 @@ +import re + +import unicodedata +from bs4 import BeautifulSoup + + +def preprocess_html(raw_html: str) -> str: + """Preprocess HTML to extract text content.""" + soup = BeautifulSoup(raw_html, 'lxml') + + # Remove scripts, styles, and other non-textual elements + for tag in soup(['script','style','noscript','iframe','canvas','svg','header','footer','nav','aside']): + tag.decompose() + # Extract text + text = soup.get_text(separator=' ') + # Normalize text and 
collapse whitespace + text = unicodedata.normalize('NFKC', text) + text = re.sub(r'[ \t\u00A0]+', ' ', text) + text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) + return text.strip() \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py index b4e4b018..4085b6dd 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py @@ -1,4 +1,5 @@ import abc +import traceback from abc import ABC from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo @@ -24,8 +25,10 @@ async def run(self) -> AgencyIDSubtaskRunInfo: try: await self.inner_logic() except Exception as e: + # Get stack trace + stack_trace: str = traceback.format_exc() return AgencyIDSubtaskRunInfo( - error=f"{type(e).__name__}: {str(e)}", + error=f"{type(e).__name__}: {str(e)}: {stack_trace}", linked_url_ids=self.linked_urls ) return AgencyIDSubtaskRunInfo( diff --git a/src/db/models/impl/link/batch_url/sqlalchemy.py b/src/db/models/impl/link/batch_url/sqlalchemy.py index 8fb8f42e..951ac539 100644 --- a/src/db/models/impl/link/batch_url/sqlalchemy.py +++ b/src/db/models/impl/link/batch_url/sqlalchemy.py @@ -13,5 +13,3 @@ class LinkBatchURL( ): __tablename__ = "link_batch_urls" - url = relationship('URL', overlaps="batch") - batch = relationship('Batch', overlaps="url") \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 82b337b0..2001f9ed 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -40,7 +40,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): "Batch", secondary="link_batch_urls", back_populates="urls", - uselist=False + uselist=False, ) duplicates = 
relationship("Duplicate", back_populates="original_url") html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index 929b88bd..de6ee029 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -25,4 +25,4 @@ class AgencyIDSubtaskSuggestion( nullable=False, ) - agency = relationship("Agency") \ No newline at end of file + agency = relationship("Agency", viewonly=True) \ No newline at end of file diff --git a/src/external/pdap/dtos/search_agency_by_location/params.py b/src/external/pdap/dtos/search_agency_by_location/params.py index 800fa881..ca5a6213 100644 --- a/src/external/pdap/dtos/search_agency_by_location/params.py +++ b/src/external/pdap/dtos/search_agency_by_location/params.py @@ -1,6 +1,11 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field class SearchAgencyByLocationParams(BaseModel): request_id: int - query: str \ No newline at end of file + query: str + iso: str = Field( + description="US State ISO Code", + max_length=2, + + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py deleted file mode 100644 index f0a27b97..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/params.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel - -from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData 
-from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse - - -class ConvertSearchAgencyResponsesTestParams(BaseModel): - search_agency_by_location_responses: list[SearchAgencyByLocationResponse] - expected_subtask_data: AutoAgencyIDSubtaskData diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py deleted file mode 100644 index 1d36d120..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/convert_search_agency_responses/test_core.py +++ /dev/null @@ -1,127 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ - convert_search_agency_responses_to_subtask_data_list -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ - URLRequestIDMapper -from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData -from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse, \ - SearchAgencyByLocationAgencyInfo -from tests.automated.integration.tasks.url.impl.agency_identification.subtasks.nlp_location_match.internal_processor.convert_search_agency_responses.params import \ - ConvertSearchAgencyResponsesTestParams - -PARAMETERS = [ - 
ConvertSearchAgencyResponsesTestParams( - search_agency_by_location_responses=[ - SearchAgencyByLocationResponse( - request_id=1, - results=[ - SearchAgencyByLocationAgencyInfo( - agency_id=1, - similarity=1.0, - ), - SearchAgencyByLocationAgencyInfo( - agency_id=2, - similarity=0.5, - ), - ] - ), - SearchAgencyByLocationResponse( - request_id=2, - results=[ - SearchAgencyByLocationAgencyInfo( - agency_id=3, - similarity=0.75, - ), - ] - ) - ], - expected_subtask_data=AutoAgencyIDSubtaskData( - pydantic_model=URLAutoAgencyIDSubtaskPydantic( - task_id=1, - url_id=1, - type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=True, - ), - suggestions=[ - AgencySuggestion( - agency_id=1, - confidence=100, - ), - AgencySuggestion( - agency_id=2, - confidence=50, - ), - AgencySuggestion( - agency_id=3, - confidence=75, - ) - ] - ) - ), - ConvertSearchAgencyResponsesTestParams( - search_agency_by_location_responses=[ - SearchAgencyByLocationResponse( - request_id=3, - results=[ - SearchAgencyByLocationAgencyInfo( - agency_id=1, - similarity=1.0, - ), - SearchAgencyByLocationAgencyInfo( - agency_id=2, - similarity=0.5, - ), - ] - ) - ], - expected_subtask_data=AutoAgencyIDSubtaskData( - pydantic_model=URLAutoAgencyIDSubtaskPydantic( - task_id=1, - url_id=2, - type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=True, - ), - suggestions=[ - AgencySuggestion( - agency_id=1, - confidence=100, - ), - AgencySuggestion( - agency_id=2, - confidence=50, - ) - ] - ) - ), -] - -@pytest.mark.asyncio -async def test_params( -) -> None: - mapper = URLRequestIDMapper() - mapper.add_mapping(request_id=1, url_id=1) - mapper.add_mapping(request_id=2, url_id=1) - mapper.add_mapping(request_id=3, url_id=2) - - search_responses: list[SearchAgencyByLocationResponse] = [] - for param in PARAMETERS: - search_responses.extend(param.search_agency_by_location_responses) - - subtask_data_list: list[AutoAgencyIDSubtaskData] = \ - 
convert_search_agency_responses_to_subtask_data_list( - responses=search_responses, - task_id=1, - mapper=mapper, - ) - - assert len(subtask_data_list) == len(PARAMETERS) - - for subtask_data, param in zip(subtask_data_list, PARAMETERS): - expected_subtask_data: AutoAgencyIDSubtaskData = param.expected_subtask_data - assert subtask_data.pydantic_model == expected_subtask_data.pydantic_model - assert subtask_data.suggestions == expected_subtask_data.suggestions - diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py new file mode 100644 index 00000000..ea81341c --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py @@ -0,0 +1,57 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + +US_STATE = USState( + name="Pennsylvania", + iso="PA", +) + +SINGLE_LOCATION: list[str] = ["Pittsburgh"] +MULTIPLE_LOCATION: list[str] = ["Pittsburgh", "Allegheny"] + +@pytest.mark.parametrize( + argnames="nlp_response, expected_result", + argvalues=[ + ( + NLPLocationMatchResponse( + locations=SINGLE_LOCATION, + us_state=US_STATE + ), + True, + ), + ( + NLPLocationMatchResponse( + locations=MULTIPLE_LOCATION, + us_state=US_STATE, + ), + True + ), + ( + NLPLocationMatchResponse( + locations=MULTIPLE_LOCATION, + us_state=None, + ), + False, + ), + ( + NLPLocationMatchResponse( + locations=[], + us_state=US_STATE, + ), + False, + ), + ( + NLPLocationMatchResponse( + locations=[], + us_state=None, + ), + False + ) + ], +) +def 
test_nlp_response_valid(nlp_response: NLPLocationMatchResponse, expected_result: bool): + assert nlp_response.valid == expected_result \ No newline at end of file