From 152c02270998759a30ad58b187d6c1fa56bfd803 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 22 Jul 2025 08:03:13 -0400 Subject: [PATCH 1/6] Continue draft --- ...1bab33_setup_for_sync_data_sources_task.py | 92 +++++++++++ .../queries/get_annotation_batch_info.py | 2 +- .../get_next_url_for_user_annotation.py | 2 +- .../agency/get/queries/agency_suggestion.py | 2 +- .../agency/get/queries/next_for_annotation.py | 2 +- src/api/endpoints/annotate/all/get/query.py | 3 +- .../endpoints/annotate/relevance/get/query.py | 3 +- src/api/endpoints/batch/dtos/get/logs.py | 2 +- src/api/endpoints/batch/duplicates/dto.py | 2 +- src/api/endpoints/batch/duplicates/query.py | 8 +- src/api/endpoints/batch/urls/dto.py | 2 +- src/api/endpoints/batch/urls/query.py | 4 +- src/api/endpoints/collector/manual/query.py | 4 +- .../metrics/batches/aggregated/query.py | 4 +- .../metrics/batches/breakdown/query.py | 4 +- src/api/endpoints/review/approve/query.py | 4 +- src/api/endpoints/review/next/query.py | 8 +- src/api/endpoints/review/reject/query.py | 2 +- src/api/endpoints/task/by_id/dto.py | 4 +- src/api/endpoints/task/by_id/query.py | 6 +- src/api/endpoints/url/get/query.py | 4 +- src/collectors/source_collectors/base.py | 2 +- src/core/core.py | 2 +- src/core/logger.py | 2 +- src/core/preprocessors/autogoogler.py | 2 +- src/core/preprocessors/base.py | 2 +- src/core/preprocessors/ckan.py | 2 +- src/core/preprocessors/common_crawler.py | 2 +- src/core/preprocessors/example.py | 2 +- src/core/preprocessors/muckrock.py | 2 +- src/core/tasks/scheduled/loader.py | 14 +- src/core/tasks/scheduled/manager.py | 13 +- .../scheduled/{operators => sync}/__init__.py | 0 .../agency_sync => sync/agency}/__init__.py | 0 .../agency}/dtos/__init__.py | 0 .../agency}/dtos/parameters.py | 4 +- .../core.py => sync/agency/operator.py} | 12 +- .../sync/agency/queries}/__init__.py | 0 .../sync/agency/queries/get_sync_params.py | 30 ++++ .../sync/agency/queries/mark_full_sync.py | 13 ++ .../agency/queries/update_sync_progress.py | 11 ++ .../scheduled/sync/agency/queries/upsert.py | 20 +++ src/core/tasks/scheduled/sync/check.py | 14 ++ .../agency_sync => sync}/constants.py | 0 .../scheduled/sync/data_sources}/__init__.py | 0 .../sync/data_sources/dtos}/__init__.py | 0 .../sync/data_sources/dtos/parameters.py | 8 + .../scheduled/sync/data_sources/operator.py | 43 +++++ .../sync/data_sources/queries}/__init__.py | 0 .../data_sources/queries/get_sync_params.py | 27 ++++ .../data_sources/queries/mark_full_sync.py | 13 ++ .../queries/update_sync_progress.py | 11 ++ .../sync/data_sources/queries/upsert.py | 53 +++++++ .../agency_sync => sync}/exceptions.py | 0 .../tasks/scheduled/templates}/__init__.py | 0 .../base.py => templates/operator.py} | 0 .../operators/agency_identification/core.py | 2 +- ...pending_urls_without_agency_suggestions.py | 6 +- .../tasks/url/operators/auto_relevant/core.py | 4 +- .../auto_relevant/queries/get_tdos.py | 4 +- .../tasks/url/operators/record_type/core.py | 2 +- .../url/operators/submit_approved_url/core.py | 2 +- src/core/tasks/url/operators/url_html/core.py | 4 +- .../get_pending_urls_without_html_data.py | 5 +- src/core/tasks/url/operators/url_html/tdo.py | 2 +- .../url_miscellaneous_metadata/core.py | 2 +- ...pending_urls_missing_miscellaneous_data.py | 4 +- src/db/client/async_.py | 147 ++++++++++-------- src/db/client/sync.py | 16 +- src/db/client/types.py | 2 +- src/db/constants.py | 2 +- src/db/dto_converter.py | 5 +- src/db/dtos/duplicate.py | 12 -- src/db/dtos/metadata_annotation.py | 11 -- 
src/db/dtos/url/metadata.py | 19 --- src/db/enums.py | 1 + src/db/models/helpers.py | 2 +- .../models/instantiations/agency/__init__.py | 0 .../agency/pydantic/__init__.py | 0 .../instantiations/agency/pydantic/upsert.py | 23 +++ .../{agency.py => agency/sqlalchemy.py} | 0 .../models/instantiations/batch/__init__.py | 0 .../instantiations/batch/pydantic.py} | 0 .../{batch.py => batch/sqlalchemy.py} | 0 .../instantiations/confirmed_url_agency.py | 6 +- .../instantiations/duplicate/__init__.py | 0 .../duplicate/pydantic/__init__.py | 0 .../instantiations/duplicate/pydantic/info.py | 8 + .../duplicate/pydantic/insert.py | 7 + .../{duplicate.py => duplicate/sqlalchemy.py} | 0 src/db/models/instantiations/log/__init__.py | 0 .../instantiations/log/pydantic/__init__.py | 0 .../instantiations/log/pydantic/info.py} | 5 - .../instantiations/log/pydantic/output.py | 10 ++ .../{log.py => log/sqlalchemy.py} | 0 .../instantiations/sync_state/__init__.py | 0 .../agencies.py} | 0 .../instantiations/sync_state/data_sources.py | 28 ++++ .../instantiations/url/core/__init__.py | 0 .../url/core/pydantic/__init__.py | 0 .../instantiations/url/core/pydantic/info.py} | 0 .../url/core/pydantic/upsert.py | 24 +++ .../url/{core.py => core/sqlalchemy.py} | 0 .../instantiations/url/error_info/__init__.py | 0 .../url/error_info/pydantic.py} | 0 .../sqlalchemy.py} | 0 .../url/suggestion/relevant/auto/__init__.py | 0 .../relevant/auto/pydantic/__init__.py | 0 .../relevant/auto/pydantic/input.py} | 0 .../relevant/{auto.py => auto/sqlalchemy.py} | 0 .../core/common/annotation_exists.py | 2 +- .../get/recent_batch_summaries/builder.py | 2 +- .../url_counts/builder.py | 4 +- .../core/metrics/urls/aggregated/pending.py | 4 +- .../core/tasks/agency_sync/upsert.py | 19 --- src/db/statement_composer.py | 4 +- src/db/templates/__init__.py | 0 src/db/templates/upsert.py | 20 +++ src/external/pdap/client.py | 36 ++++- src/external/pdap/dtos/sync/__init__.py | 0 .../{agencies_sync.py => sync/agencies.py} | 0 src/external/pdap/dtos/sync/data_sources.py | 21 +++ src/external/pdap/enums.py | 6 + .../api/review/rejection/helpers.py | 2 +- .../test_approve_and_get_next_source.py | 4 +- tests/automated/integration/api/test_batch.py | 2 +- .../integration/api/test_example_collector.py | 2 +- .../integration/api/test_manual_batch.py | 4 +- .../annotate_url/test_agency_not_in_db.py | 2 +- .../db/client/approve_url/test_basic.py | 2 +- .../db/client/test_add_url_error_info.py | 2 +- .../db/client/test_delete_old_logs.py | 2 +- .../db/client/test_delete_url_updated_at.py | 2 +- .../integration/db/client/test_insert_logs.py | 2 +- .../integration/db/client/test_insert_urls.py | 4 +- .../integration/db/test_database_structure.py | 2 +- .../tasks/scheduled/sync/__init__.py | 0 .../tasks/scheduled/sync/agency/__init__.py | 0 .../{agency_sync => sync/agency}/conftest.py | 4 +- .../{agency_sync => sync/agency}/data.py | 2 +- .../agency}/existence_checker.py | 6 +- .../{agency_sync => sync/agency}/helpers.py | 6 +- .../agency}/test_happy_path.py | 12 +- .../agency}/test_interruption.py | 12 +- .../agency}/test_no_new_results.py | 14 +- .../scheduled/sync/data_sources/__init__.py | 0 .../scheduled/sync/data_sources/conftest.py | 11 ++ .../tasks/scheduled/sync/data_sources/data.py | 2 + .../sync/data_sources/existence_checker.py | 5 + .../sync/data_sources/setup/__init__.py | 0 .../scheduled/sync/data_sources/setup/core.py | 131 ++++++++++++++++ .../scheduled/sync/data_sources/setup/info.py | 16 ++ .../sync/data_sources/test_happy_path.py | 0 
.../sync/data_sources/test_interruption.py | 0 .../sync/data_sources/test_no_new_results.py | 0 .../tasks/url/auto_relevant/test_task.py | 6 +- .../url/duplicate/test_url_duplicate_task.py | 2 +- .../url/test_agency_preannotation_task.py | 2 +- .../url/test_submit_approved_url_task.py | 4 +- .../tasks/url/test_url_404_probe.py | 2 +- .../test_url_miscellaneous_metadata_task.py | 2 +- tests/automated/unit/core/test_core_logger.py | 2 +- .../test_autogoogler_collector.py | 2 +- .../test_common_crawl_collector.py | 2 +- .../test_muckrock_collectors.py | 2 +- tests/helpers/db_data_creator.py | 10 +- tests/helpers/setup/populate.py | 2 +- .../lifecycle/test_auto_googler_lifecycle.py | 2 +- .../core/lifecycle/test_ckan_lifecycle.py | 2 +- .../lifecycle/test_muckrock_lifecycles.py | 2 +- .../external/pdap/test_sync_agencies.py | 2 +- .../test_html_tag_collector_integration.py | 2 +- 172 files changed, 944 insertions(+), 308 deletions(-) create mode 100644 alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py rename src/core/tasks/scheduled/{operators => sync}/__init__.py (100%) rename src/core/tasks/scheduled/{operators/agency_sync => sync/agency}/__init__.py (100%) rename src/core/tasks/scheduled/{operators/agency_sync => sync/agency}/dtos/__init__.py (100%) rename src/core/tasks/scheduled/{operators/agency_sync => sync/agency}/dtos/parameters.py (69%) rename src/core/tasks/scheduled/{operators/agency_sync/core.py => sync/agency/operator.py} (68%) rename src/{db/dtos/url/annotations => core/tasks/scheduled/sync/agency/queries}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py create mode 100644 src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py create mode 100644 src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py create mode 100644 src/core/tasks/scheduled/sync/agency/queries/upsert.py create mode 100644 src/core/tasks/scheduled/sync/check.py rename src/core/tasks/scheduled/{operators/agency_sync => sync}/constants.py (100%) rename src/{db/dtos/url/annotations/auto => core/tasks/scheduled/sync/data_sources}/__init__.py (100%) rename src/{db/queries/implementations/core/tasks => core/tasks/scheduled/sync/data_sources/dtos}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/sync/data_sources/dtos/parameters.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/operator.py rename src/{db/queries/implementations/core/tasks/agency_sync => core/tasks/scheduled/sync/data_sources/queries}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert.py rename src/core/tasks/scheduled/{operators/agency_sync => sync}/exceptions.py (100%) rename {tests/automated/integration/tasks/scheduled/agency_sync => src/core/tasks/scheduled/templates}/__init__.py (100%) rename src/core/tasks/scheduled/{operators/base.py => templates/operator.py} (100%) delete mode 100644 src/db/dtos/duplicate.py delete mode 100644 src/db/dtos/metadata_annotation.py delete mode 100644 src/db/dtos/url/metadata.py create mode 100644 src/db/models/instantiations/agency/__init__.py create mode 100644 src/db/models/instantiations/agency/pydantic/__init__.py create mode 100644 
src/db/models/instantiations/agency/pydantic/upsert.py rename src/db/models/instantiations/{agency.py => agency/sqlalchemy.py} (100%) create mode 100644 src/db/models/instantiations/batch/__init__.py rename src/db/{dtos/batch.py => models/instantiations/batch/pydantic.py} (100%) rename src/db/models/instantiations/{batch.py => batch/sqlalchemy.py} (100%) create mode 100644 src/db/models/instantiations/duplicate/__init__.py create mode 100644 src/db/models/instantiations/duplicate/pydantic/__init__.py create mode 100644 src/db/models/instantiations/duplicate/pydantic/info.py create mode 100644 src/db/models/instantiations/duplicate/pydantic/insert.py rename src/db/models/instantiations/{duplicate.py => duplicate/sqlalchemy.py} (100%) create mode 100644 src/db/models/instantiations/log/__init__.py create mode 100644 src/db/models/instantiations/log/pydantic/__init__.py rename src/db/{dtos/log.py => models/instantiations/log/pydantic/info.py} (65%) create mode 100644 src/db/models/instantiations/log/pydantic/output.py rename src/db/models/instantiations/{log.py => log/sqlalchemy.py} (100%) create mode 100644 src/db/models/instantiations/sync_state/__init__.py rename src/db/models/instantiations/{sync_state_agencies.py => sync_state/agencies.py} (100%) create mode 100644 src/db/models/instantiations/sync_state/data_sources.py create mode 100644 src/db/models/instantiations/url/core/__init__.py create mode 100644 src/db/models/instantiations/url/core/pydantic/__init__.py rename src/db/{dtos/url/core.py => models/instantiations/url/core/pydantic/info.py} (100%) create mode 100644 src/db/models/instantiations/url/core/pydantic/upsert.py rename src/db/models/instantiations/url/{core.py => core/sqlalchemy.py} (100%) create mode 100644 src/db/models/instantiations/url/error_info/__init__.py rename src/db/{dtos/url/error.py => models/instantiations/url/error_info/pydantic.py} (100%) rename src/db/models/instantiations/url/{error_info.py => error_info/sqlalchemy.py} (100%) create mode 100644 src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py create mode 100644 src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py rename src/db/{dtos/url/annotations/auto/relevancy.py => models/instantiations/url/suggestion/relevant/auto/pydantic/input.py} (100%) rename src/db/models/instantiations/url/suggestion/relevant/{auto.py => auto/sqlalchemy.py} (100%) delete mode 100644 src/db/queries/implementations/core/tasks/agency_sync/upsert.py create mode 100644 src/db/templates/__init__.py create mode 100644 src/db/templates/upsert.py create mode 100644 src/external/pdap/dtos/sync/__init__.py rename src/external/pdap/dtos/{agencies_sync.py => sync/agencies.py} (100%) create mode 100644 src/external/pdap/dtos/sync/data_sources.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/agency/__init__.py rename tests/automated/integration/tasks/scheduled/{agency_sync => sync/agency}/conftest.py (74%) rename tests/automated/integration/tasks/scheduled/{agency_sync => sync/agency}/data.py (97%) rename tests/automated/integration/tasks/scheduled/{agency_sync => sync/agency}/existence_checker.py (80%) rename tests/automated/integration/tasks/scheduled/{agency_sync => sync/agency}/helpers.py (92%) rename tests/automated/integration/tasks/scheduled/{agency_sync => sync/agency}/test_happy_path.py (77%) rename tests/automated/integration/tasks/scheduled/{agency_sync => 
sync/agency}/test_interruption.py (84%) rename tests/automated/integration/tasks/scheduled/{agency_sync => sync/agency}/test_no_new_results.py (74%) create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/data.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py diff --git a/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py b/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py new file mode 100644 index 00000000..07a51dc4 --- /dev/null +++ b/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py @@ -0,0 +1,92 @@ +"""Setup for sync data sources task + +Revision ID: 59d2af1bab33 +Revises: 9552d354ccf4 +Create Date: 2025-07-21 06:37:51.043504 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import switch_enum_type, id_column + +# revision identifiers, used by Alembic. 
+revision: str = '59d2af1bab33' +down_revision: Union[str, None] = '9552d354ccf4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +SYNC_STATE_TABLE_NAME = "data_sources_sync_state" +URL_DATA_SOURCES_METADATA_TABLE_NAME = "url_data_sources_metadata" + +def _create_data_sources_sync_state_table() -> None: + table = op.create_table( + SYNC_STATE_TABLE_NAME, + id_column(), + sa.Column('last_full_sync_at', sa.DateTime(), nullable=True), + sa.Column('current_cutoff_date', sa.Date(), nullable=True), + sa.Column('current_page', sa.Integer(), nullable=True), + ) + # Add row to `data_sources_sync_state` table + op.bulk_insert( + table, + [ + { + "last_full_sync_at": None, + "current_cutoff_date": None, + "current_page": None + } + ] + ) + +def _drop_data_sources_sync_state_table() -> None: + op.drop_table(SYNC_STATE_TABLE_NAME) + +def _create_data_sources_sync_task() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources' + ] + ) + +def _drop_data_sources_sync_task() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + ] + ) + + +def upgrade() -> None: + _create_data_sources_sync_state_table() + _create_data_sources_sync_task() + + +def downgrade() -> None: + _drop_data_sources_sync_task() + _drop_data_sources_sync_state_table() diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 15f5b631..1bab0fdf 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -6,7 +6,7 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.types import UserSuggestionType diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index 3bda8ff3..8cadb337 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -6,7 +6,7 @@ from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git 
a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py index f1ab8b67..14a00260 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py @@ -3,7 +3,7 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.core.enums import SuggestionType -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 5bfd6e8a..fcc103ac 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -11,7 +11,7 @@ from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index 1191e8d6..7ce8a94f 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -4,7 +4,6 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder -from src.api.endpoints.annotate.agency.get.queries.next_for_annotation import GetNextURLAgencyForAnnotationQueryBuilder from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo @@ -12,7 +11,7 @@ from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion diff --git a/src/api/endpoints/annotate/relevance/get/query.py b/src/api/endpoints/annotate/relevance/get/query.py index ffd37d2c..11e509d0 100644 --- a/src/api/endpoints/annotate/relevance/get/query.py +++ b/src/api/endpoints/annotate/relevance/get/query.py @@ -5,10 +5,9 @@ GetNextURLForUserAnnotationQueryBuilder from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseInfo, \ RelevanceAnnotationResponseInfo -from 
src.core.tasks.url.operators.auto_relevant.models.annotation import RelevanceAnnotationInfo from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/dtos/get/logs.py b/src/api/endpoints/batch/dtos/get/logs.py index a350caa1..437e53cd 100644 --- a/src/api/endpoints/batch/dtos/get/logs.py +++ b/src/api/endpoints/batch/dtos/get/logs.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.log import LogOutputInfo +from src.db.models.instantiations.log.pydantic.output import LogOutputInfo class GetBatchLogsResponse(BaseModel): diff --git a/src/api/endpoints/batch/duplicates/dto.py b/src/api/endpoints/batch/duplicates/dto.py index 3838be77..b3fe5f17 100644 --- a/src/api/endpoints/batch/duplicates/dto.py +++ b/src/api/endpoints/batch/duplicates/dto.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from src.db.dtos.duplicate import DuplicateInfo +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo class GetDuplicatesByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index a4c3aa31..389cfa8a 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -2,11 +2,11 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import aliased -from src.db.dtos.duplicate import DuplicateInfo -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.duplicate import Duplicate +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/urls/dto.py b/src/api/endpoints/batch/urls/dto.py index 40b1e753..90f9b209 100644 --- a/src/api/endpoints/batch/urls/dto.py +++ b/src/api/endpoints/batch/urls/dto.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class GetURLsByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index fcfba3ee..40aa5935 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,9 +1,9 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/manual/query.py 
b/src/api/endpoints/collector/manual/query.py index 2f29a357..8008dc5b 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -5,9 +5,9 @@ from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py index 12616a22..c644a742 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query.py @@ -6,9 +6,9 @@ GetMetricsBatchesAggregatedInnerResponseDTO from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 771543ac..36914e29 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -6,9 +6,9 @@ GetMetricsBatchesBreakdownInnerResponseDTO from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/review/approve/query.py b/src/api/endpoints/review/approve/query.py index bff32bf3..c562fc43 100644 --- a/src/api/endpoints/review/approve/query.py +++ b/src/api/endpoints/review/approve/query.py @@ -9,9 +9,9 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase diff 
--git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 8f7d5e35..527ab1c4 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -1,6 +1,6 @@ from typing import Optional, Type -from sqlalchemy import FromClause, select, and_, Select, desc, asc, func, join +from sqlalchemy import FromClause, select, and_, Select, desc, asc, func from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload @@ -8,14 +8,14 @@ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from src.collectors.enums import URLStatus from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info -from src.db.constants import USER_ANNOTATION_MODELS, ALL_ANNOTATION_MODELS +from src.db.constants import USER_ANNOTATION_MODELS from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.mixins import URLDependentMixin diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 50bee0bc..e7afa439 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/task/by_id/dto.py b/src/api/endpoints/task/by_id/dto.py index 411ad7f7..65fa74c5 100644 --- a/src/api/endpoints/task/by_id/dto.py +++ b/src/api/endpoints/task/by_id/dto.py @@ -3,8 +3,8 @@ from pydantic import BaseModel -from src.db.dtos.url.error import URLErrorPydanticInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.enums import TaskType from src.core.enums import BatchStatus diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index a57b9daf..c2b32234 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -5,11 +5,11 @@ from src.api.endpoints.task.by_id.dto import TaskInfo from src.collectors.enums import URLStatus from src.core.enums import BatchStatus -from src.db.dtos.url.core import URLInfo -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from 
src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index 1ba5a75f..8bdb97bd 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -5,8 +5,8 @@ from src.api.endpoints.url.get.dto import GetURLsResponseInfo, GetURLsResponseErrorInfo, GetURLsResponseInnerInfo from src.collectors.enums import URLStatus from src.db.client.helpers import add_standard_limit_and_offset -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.url.error_info import URLErrorInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/collectors/source_collectors/base.py b/src/collectors/source_collectors/base.py index 5fbb08c5..32cd3a48 100644 --- a/src/collectors/source_collectors/base.py +++ b/src/collectors/source_collectors/base.py @@ -8,7 +8,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger from src.core.function_trigger import FunctionTrigger diff --git a/src/core/core.py b/src/core/core.py index 78554b39..0b649b05 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -35,7 +35,7 @@ from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.db.enums import TaskType from src.collectors.manager import AsyncCollectorManager diff --git a/src/core/logger.py b/src/core/logger.py index e49dd057..804edffd 100644 --- a/src/core/logger.py +++ b/src/core/logger.py @@ -1,7 +1,7 @@ import asyncio from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo class AsyncCoreLogger: diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index e827c77d..8163115c 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/base.py b/src/core/preprocessors/base.py index dea8df10..2f777d5f 100644 --- a/src/core/preprocessors/base.py +++ b/src/core/preprocessors/base.py @@ -2,7 +2,7 @@ from abc import ABC from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class PreprocessorBase(ABC): diff --git a/src/core/preprocessors/ckan.py b/src/core/preprocessors/ckan.py index c07d4ab5..0b1cef2e 100644 --- a/src/core/preprocessors/ckan.py +++ b/src/core/preprocessors/ckan.py @@ -1,7 +1,7 @@ from 
datetime import datetime from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class CKANPreprocessor: diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index 9a7e1d04..57457ed4 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index dfc7338a..e357d2a2 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index 281ea2f8..7952ee56 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index fb92dcb0..bd2e4b84 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -1,4 +1,5 @@ -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient @@ -15,9 +16,14 @@ def __init__( self.pdap_client = pdap_client - async def get_sync_agencies_task_operator(self): - operator = SyncAgenciesTaskOperator( + async def get_sync_agencies_task_operator(self) -> SyncAgenciesTaskOperator: + return SyncAgenciesTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + + async def get_sync_data_sources_task_operator(self) -> SyncDataSourcesTaskOperator: + return SyncDataSourcesTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ) - return operator \ No newline at end of file diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index 44576cfa..66b50535 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -6,7 +6,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -from src.core.tasks.scheduled.operators.base import ScheduledTaskOperatorBase +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase class AsyncScheduledTaskManager: @@ -30,6 +30,7 @@ def __init__( self.delete_logs_job = None self.populate_backlog_snapshot_job = None self.sync_agencies_job = None + self.sync_data_sources_job = None async def setup(self): self.scheduler.start() @@ -68,6 +69,16 @@ async def add_scheduled_tasks(self): "operator": await 
self.loader.get_sync_agencies_task_operator() } ) + self.sync_data_sources_job = self.scheduler.add_job( + self.run_task, + trigger=IntervalTrigger( + days=1, + start_date=datetime.now() + timedelta(minutes=3) + ), + kwargs={ + "operator": await self.loader.get_sync_data_sources_task_operator() + } + ) def shutdown(self): if self.scheduler.running: diff --git a/src/core/tasks/scheduled/operators/__init__.py b/src/core/tasks/scheduled/sync/__init__.py similarity index 100% rename from src/core/tasks/scheduled/operators/__init__.py rename to src/core/tasks/scheduled/sync/__init__.py diff --git a/src/core/tasks/scheduled/operators/agency_sync/__init__.py b/src/core/tasks/scheduled/sync/agency/__init__.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/__init__.py rename to src/core/tasks/scheduled/sync/agency/__init__.py diff --git a/src/core/tasks/scheduled/operators/agency_sync/dtos/__init__.py b/src/core/tasks/scheduled/sync/agency/dtos/__init__.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/dtos/__init__.py rename to src/core/tasks/scheduled/sync/agency/dtos/__init__.py diff --git a/src/core/tasks/scheduled/operators/agency_sync/dtos/parameters.py b/src/core/tasks/scheduled/sync/agency/dtos/parameters.py similarity index 69% rename from src/core/tasks/scheduled/operators/agency_sync/dtos/parameters.py rename to src/core/tasks/scheduled/sync/agency/dtos/parameters.py index 3d8cceb4..5afa53f1 100644 --- a/src/core/tasks/scheduled/operators/agency_sync/dtos/parameters.py +++ b/src/core/tasks/scheduled/sync/agency/dtos/parameters.py @@ -5,5 +5,5 @@ class AgencySyncParameters(BaseModel): - cutoff_date: Optional[date] - page: Optional[int] + cutoff_date: date | None + page: int | None diff --git a/src/core/tasks/scheduled/operators/agency_sync/core.py b/src/core/tasks/scheduled/sync/agency/operator.py similarity index 68% rename from src/core/tasks/scheduled/operators/agency_sync/core.py rename to src/core/tasks/scheduled/sync/agency/operator.py index c522effd..7b8c1a80 100644 --- a/src/core/tasks/scheduled/operators/agency_sync/core.py +++ b/src/core/tasks/scheduled/sync/agency/operator.py @@ -1,7 +1,6 @@ -from src.core.tasks.scheduled.operators.agency_sync.constants import MAX_SYNC_REQUESTS -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.operators.agency_sync.exceptions import MaxRequestsExceededError -from src.core.tasks.scheduled.operators.base import ScheduledTaskOperatorBase +from src.core.tasks.scheduled.sync.check import check_max_sync_requests_not_exceeded +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.external.pdap.client import PDAPClient @@ -29,10 +28,7 @@ async def inner_task_logic(self): response = await self.pdap_client.sync_agencies(params) request_count = 1 while len(response.agencies) > 0: - if request_count > MAX_SYNC_REQUESTS: - raise MaxRequestsExceededError( - f"Max requests in a single task run ({MAX_SYNC_REQUESTS}) exceeded." 
- ) + check_max_sync_requests_not_exceeded(request_count) await self.adb_client.upsert_agencies(response.agencies) params = AgencySyncParameters( diff --git a/src/db/dtos/url/annotations/__init__.py b/src/core/tasks/scheduled/sync/agency/queries/__init__.py similarity index 100% rename from src/db/dtos/url/annotations/__init__.py rename to src/core/tasks/scheduled/sync/agency/queries/__init__.py diff --git a/src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py b/src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py new file mode 100644 index 00000000..8ff148e8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py @@ -0,0 +1,30 @@ +from sqlalchemy import select +from sqlalchemy.exc import NoResultFound +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgenciesSyncParametersQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AgencySyncParameters: + query = select( + AgenciesSyncState.current_page, + AgenciesSyncState.current_cutoff_date + ) + try: + result = (await session.execute(query)).mappings().one() + return AgencySyncParameters( + page=result['current_page'], + cutoff_date=result['current_cutoff_date'] + ) + except NoResultFound: + # Add value + state = AgenciesSyncState() + session.add(state) + return AgencySyncParameters(page=None, cutoff_date=None) + + + diff --git a/src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py b/src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py new file mode 100644 index 00000000..50e7642c --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py @@ -0,0 +1,13 @@ +from sqlalchemy import update, func, text, Update + +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState + + +def get_mark_full_agencies_sync_query() -> Update: + return update( + AgenciesSyncState + ).values( + last_full_sync_at=func.now(), + current_cutoff_date=func.now() - text('interval \'1 day\''), + current_page=None + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py b/src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py new file mode 100644 index 00000000..2055bdc9 --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py @@ -0,0 +1,11 @@ +from sqlalchemy import Update, update + +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState + + +def get_update_agencies_sync_progress_query(page: int) -> Update: + return update( + AgenciesSyncState + ).values( + current_page=page + ) diff --git a/src/core/tasks/scheduled/sync/agency/queries/upsert.py b/src/core/tasks/scheduled/sync/agency/queries/upsert.py new file mode 100644 index 00000000..64988cba --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/upsert.py @@ -0,0 +1,20 @@ +from src.db.models.instantiations.agency.pydantic.upsert import AgencyUpsertModel +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def convert_agencies_sync_response_to_agencies_upsert( + agencies: list[AgenciesSyncResponseInnerInfo] +) -> list[AgencyUpsertModel]: + results = [] + for agency in agencies: + results.append( + AgencyUpsertModel( + agency_id=agency.agency_id, + name=agency.display_name, + 
state=agency.state_name, + county=agency.county_name, + locality=agency.locality_name, + ds_last_updated_at=agency.updated_at + ) + ) + return results \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/check.py b/src/core/tasks/scheduled/sync/check.py new file mode 100644 index 00000000..449506c5 --- /dev/null +++ b/src/core/tasks/scheduled/sync/check.py @@ -0,0 +1,14 @@ +from src.core.tasks.scheduled.sync.constants import MAX_SYNC_REQUESTS +from src.core.tasks.scheduled.sync.exceptions import MaxRequestsExceededError + + +def check_max_sync_requests_not_exceeded(request_count: int) -> None: + """ + Raises: + MaxRequestsExceededError: If the number of requests made exceeds the maximum allowed. + """ + + if request_count > MAX_SYNC_REQUESTS: + raise MaxRequestsExceededError( + f"Max requests in a single task run ({MAX_SYNC_REQUESTS}) exceeded." + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/operators/agency_sync/constants.py b/src/core/tasks/scheduled/sync/constants.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/constants.py rename to src/core/tasks/scheduled/sync/constants.py diff --git a/src/db/dtos/url/annotations/auto/__init__.py b/src/core/tasks/scheduled/sync/data_sources/__init__.py similarity index 100% rename from src/db/dtos/url/annotations/auto/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/__init__.py diff --git a/src/db/queries/implementations/core/tasks/__init__.py b/src/core/tasks/scheduled/sync/data_sources/dtos/__init__.py similarity index 100% rename from src/db/queries/implementations/core/tasks/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/dtos/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/dtos/parameters.py b/src/core/tasks/scheduled/sync/data_sources/dtos/parameters.py new file mode 100644 index 00000000..8a502ef6 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/dtos/parameters.py @@ -0,0 +1,8 @@ +from datetime import date + +from pydantic import BaseModel + + +class DataSourcesSyncParameters(BaseModel): + cutoff_date: date | None + page: int | None diff --git a/src/core/tasks/scheduled/sync/data_sources/operator.py b/src/core/tasks/scheduled/sync/data_sources/operator.py new file mode 100644 index 00000000..57b12663 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/operator.py @@ -0,0 +1,43 @@ +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.core.tasks.scheduled.sync.check import check_max_sync_requests_not_exceeded +from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.enums import TaskType +from src.external.pdap.client import PDAPClient + + +class SyncDataSourcesTaskOperator(ScheduledTaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient + ): + super().__init__(adb_client) + self.pdap_client = pdap_client + + @property + def task_type(self): + return TaskType.SYNC_DATA_SOURCES + + async def inner_task_logic(self): + params = await self.adb_client.get_data_sources_sync_parameters() + if params.page is None: + params.page = 1 + + response = await self.pdap_client.sync_data_sources(params) + request_count = 1 + while len(response.data_sources) > 0: + check_max_sync_requests_not_exceeded(request_count) + await self.adb_client.upsert_urls_from_data_sources(response.data_sources) + + params = 
DataSourcesSyncParameters( + page=params.page + 1, + cutoff_date=params.cutoff_date + ) + await self.adb_client.update_data_sources_sync_progress(params.page) + + response = await self.pdap_client.sync_data_sources(params) + request_count += 1 + + await self.adb_client.mark_full_data_sources_sync() diff --git a/src/db/queries/implementations/core/tasks/agency_sync/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/__init__.py similarity index 100% rename from src/db/queries/implementations/core/tasks/agency_sync/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py b/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py new file mode 100644 index 00000000..4f2efe06 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py @@ -0,0 +1,27 @@ +from sqlalchemy import select +from sqlalchemy.exc import NoResultFound +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourcesSyncParametersQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> DataSourcesSyncParameters: + query = select( + DataSourcesSyncState.current_page, + DataSourcesSyncState.current_cutoff_date + ) + try: + result = (await session.execute(query)).mappings().one() + return DataSourcesSyncParameters( + page=result['current_page'], + cutoff_date=result['current_cutoff_date'] + ) + except NoResultFound: + # Add value + state = DataSourcesSyncState() + session.add(state) + return DataSourcesSyncParameters(page=None, cutoff_date=None) diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py b/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py new file mode 100644 index 00000000..8aa34c60 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py @@ -0,0 +1,13 @@ +from sqlalchemy import Update, update, func, text + +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState + + +def get_mark_full_data_sources_sync_query() -> Update: + return update( + DataSourcesSyncState + ).values( + last_full_sync_at=func.now(), + current_cutoff_date=func.now() - text('interval \'1 day\''), + current_page=None + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py b/src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py new file mode 100644 index 00000000..d6ba80e8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py @@ -0,0 +1,11 @@ +from sqlalchemy import update, Update + +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState + + +def get_update_data_sources_sync_progress_query(page: int) -> Update: + return update( + DataSourcesSyncState + ).values( + current_page=page + ) diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py new file mode 100644 index 00000000..d0fe2542 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py @@ -0,0 +1,53 @@ +from src.collectors.enums import URLStatus +from
src.db.models.instantiations.url.core.pydantic.upsert import URLUpsertModel +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus + + + +def convert_data_sources_sync_response_to_url_upsert( + data_sources: list[DataSourcesSyncResponseInnerInfo] +) -> list[URLUpsertModel]: + results = [] + for data_source in data_sources: + results.append( + URLUpsertModel( + id=data_source.id, + url=data_source.url, + name=data_source.name, + description=data_source.description, + outcome=_convert_to_source_collector_url_status( + ds_url_status=data_source.url_status, + ds_approval_status=data_source.approval_status + ), + record_type=data_source.record_type + ) + ) + return results + + +def _convert_to_source_collector_url_status( + ds_url_status: DataSourcesURLStatus, + ds_approval_status: ApprovalStatus +) -> URLStatus: + match ds_url_status: + case DataSourcesURLStatus.AVAILABLE: + raise NotImplementedError("Logic not implemented for this status.") + case DataSourcesURLStatus.NONE_FOUND: + raise NotImplementedError("Logic not implemented for this status.") + case DataSourcesURLStatus.BROKEN: + return URLStatus.NOT_FOUND + case _: + pass + + match ds_approval_status: + case ApprovalStatus.APPROVED: + return URLStatus.VALIDATED + case ApprovalStatus.REJECTED: + return URLStatus.NOT_RELEVANT + case ApprovalStatus.NEEDS_IDENTIFICATION: + return URLStatus.PENDING + case ApprovalStatus.PENDING: + return URLStatus.PENDING + case _: + raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") diff --git a/src/core/tasks/scheduled/operators/agency_sync/exceptions.py b/src/core/tasks/scheduled/sync/exceptions.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/exceptions.py rename to src/core/tasks/scheduled/sync/exceptions.py diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/__init__.py b/src/core/tasks/scheduled/templates/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/agency_sync/__init__.py rename to src/core/tasks/scheduled/templates/__init__.py diff --git a/src/core/tasks/scheduled/operators/base.py b/src/core/tasks/scheduled/templates/operator.py similarity index 100% rename from src/core/tasks/scheduled/operators/base.py rename to src/core/tasks/scheduled/templates/operator.py diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index d93143aa..993807fd 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -2,7 +2,7 @@ from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py index 27459145..327c2a9f 100644 
--- a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py @@ -1,13 +1,11 @@ -from typing import Any - from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus, CollectorType from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/auto_relevant/core.py b/src/core/tasks/url/operators/auto_relevant/core.py index 1a0c6c13..d696cc31 100644 --- a/src/core/tasks/url/operators/auto_relevant/core.py +++ b/src/core/tasks/url/operators/auto_relevant/core.py @@ -3,8 +3,8 @@ from src.core.tasks.url.operators.auto_relevant.sort import separate_success_and_error_subsets from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.annotations.auto.relevancy import AutoRelevancyAnnotationInput -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.huggingface.inference.models.input import BasicInput diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index b444b5b3..78e4c983 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -7,8 +7,8 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/operators/record_type/core.py b/src/core/tasks/url/operators/record_type/core.py index ce73ceb4..56abc6fc 100644 --- a/src/core/tasks/url/operators/record_type/core.py +++ b/src/core/tasks/url/operators/record_type/core.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.tasks.url.operators.record_type.tdo import URLRecordTypeTDO from 
src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/submit_approved_url/core.py b/src/core/tasks/url/operators/submit_approved_url/core.py index dd2df39e..d2e20c3a 100644 --- a/src/core/tasks/url/operators/submit_approved_url/core.py +++ b/src/core/tasks/url/operators/submit_approved_url/core.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/url_html/core.py b/src/core/tasks/url/operators/url_html/core.py index 495845a4..091a1c10 100644 --- a/src/core/tasks/url/operators/url_html/core.py +++ b/src/core/tasks/url/operators/url_html/core.py @@ -1,8 +1,8 @@ from http import HTTPStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.core.tasks.url.operators.url_html.tdo import UrlHtmlTDO diff --git a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py index 6af92abe..70d2f6a3 100644 --- a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py +++ b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py @@ -1,8 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.dto_converter import DTOConverter -from src.db.dtos.url.core import URLInfo -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/url_html/tdo.py b/src/core/tasks/url/operators/url_html/tdo.py index 7fe14078..f40c9bc2 100644 --- a/src/core/tasks/url/operators/url_html/tdo.py +++ b/src/core/tasks/url/operators/url_html/tdo.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py b/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py index 988fbe8b..446c32c4 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py +++ b/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py @@ -1,7 +1,7 @@ from typing import Optional from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from 
src.collectors.enums import CollectorType from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index c4c9892f..e5add9ce 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -1,12 +1,10 @@ -from typing import Any - from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload from src.collectors.enums import CollectorType from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from src.db.dtos.url.html_content import HTMLContentType -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 45505be5..febab6b3 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -6,7 +6,7 @@ from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, text, Row from sqlalchemy.dialects import postgresql from sqlalchemy.dialects.postgresql import insert as pg_insert -from sqlalchemy.exc import IntegrityError, NoResultFound +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, QueryableAttribute @@ -42,19 +42,29 @@ from src.api.endpoints.review.approve.query import ApproveURLQueryBuilder from src.api.endpoints.review.enums import RejectionReason from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse +from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder from src.api.endpoints.review.reject.query import RejectURLQueryBuilder from src.api.endpoints.search.dtos.response import SearchURLResponse from src.api.endpoints.task.by_id.dto import TaskInfo - from src.api.endpoints.task.by_id.query import GetTaskInfoQueryBuilder from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse, GetTasksResponseTaskInfo from src.api.endpoints.url.get.dto import GetURLsResponseInfo - from src.api.endpoints.url.get.query import GetURLsQueryBuilder from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from src.core.env_var_manager import EnvVarManager -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder +from src.core.tasks.scheduled.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query +from src.core.tasks.scheduled.sync.agency.queries.update_sync_progress import get_update_agencies_sync_progress_query +from src.core.tasks.scheduled.sync.agency.queries.upsert import \ + convert_agencies_sync_response_to_agencies_upsert +from 
src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.core.tasks.scheduled.sync.data_sources.queries.get_sync_params import GetDataSourcesSyncParametersQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query +from src.core.tasks.scheduled.sync.data_sources.queries.update_sync_progress import \ + get_update_data_sources_sync_progress_query +from src.core.tasks.scheduled.sync.data_sources.queries.upsert import convert_data_sources_sync_response_to_url_upsert from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \ @@ -76,34 +86,36 @@ from src.db.config_manager import ConfigManager from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.dto_converter import DTOConverter -from src.db.dtos.batch import BatchInfo -from src.db.dtos.duplicate import DuplicateInsertInfo, DuplicateInfo -from src.db.dtos.log import LogInfo, LogOutputInfo -from src.db.dtos.url.annotations.auto.relevancy import AutoRelevancyAnnotationInput -from src.db.dtos.url.core import URLInfo -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.instantiations.log.pydantic.output import LogOutputInfo +from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.backlog_snapshot import BacklogSnapshot -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.duplicate import Duplicate +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.instantiations.link.link_task_url import LinkTaskURL -from src.db.models.instantiations.log import Log +from src.db.models.instantiations.log.sqlalchemy import Log from src.db.models.instantiations.root_url_cache import RootURL -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from 
src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source import URLDataSource -from src.db.models.instantiations.url.error_info import URLErrorInfo +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 @@ -111,19 +123,19 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.templates import Base from src.db.queries.base.builder import QueryBuilderBase -from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ GetMetricsURLSAggregatedPendingQueryBuilder -from src.db.queries.implementations.core.tasks.agency_sync.upsert import get_upsert_agencies_mappings from src.db.statement_composer import StatementComposer +from src.db.templates.upsert import UpsertModel from src.db.utils.compression import decompress_html, compress_html -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo class AsyncDatabaseClient: @@ -172,8 +184,22 @@ async def add(self, session: AsyncSession, model: Base): session.add(model) @session_manager - async def add_all(self, session: AsyncSession, models: list[Base]): + async def add_all( + self, + session: AsyncSession, + models: list[Base], + return_ids: bool = False + ) -> list[int] | None: session.add_all(models) + if return_ids: + if not hasattr(models[0], "id"): + raise AttributeError("Models must have an id attribute") + await session.flush() + return [ + model.id # pyright: ignore [reportAttributeAccessIssue] + for model in models + ] + return None @session_manager async def bulk_update( @@ -192,21 +218,25 @@ async def bulk_update( async def bulk_upsert( self, session: AsyncSession, - model: Base, - mappings: list[dict], - id_value: str = "id" + models: list[UpsertModel], ): + if len(models) == 0: + return + + first_model = models[0] - query = pg_insert(model) + query = pg_insert(first_model.sa_model) + + mappings = [upsert_model.model_dump() for upsert_model in models] set_ = {} for k, v in mappings[0].items(): - if k == id_value: + if k == first_model.id_field: continue set_[k] = getattr(query.excluded, k) query = query.on_conflict_do_update( - 
index_elements=[id_value], + index_elements=[first_model.id_field], set_=set_ ) @@ -1566,56 +1596,43 @@ async def get_urls_aggregated_pending_metrics( ) return result - @session_manager - async def get_agencies_sync_parameters( - self, - session: AsyncSession - ) -> AgencySyncParameters: - query = select( - AgenciesSyncState.current_page, - AgenciesSyncState.current_cutoff_date + async def get_agencies_sync_parameters(self) -> AgencySyncParameters: + return await self.run_query_builder( + GetAgenciesSyncParametersQueryBuilder() ) - try: - result = (await session.execute(query)).mappings().one() - return AgencySyncParameters( - page=result['current_page'], - cutoff_date=result['current_cutoff_date'] - ) - except NoResultFound: - # Add value - state = AgenciesSyncState() - session.add(state) - return AgencySyncParameters(page=None, cutoff_date=None) - + async def get_data_sources_sync_parameters(self) -> DataSourcesSyncParameters: + return await self.run_query_builder( + GetDataSourcesSyncParametersQueryBuilder() + ) async def upsert_agencies( self, agencies: list[AgenciesSyncResponseInnerInfo] ): await self.bulk_upsert( - model=Agency, - mappings=get_upsert_agencies_mappings(agencies), - id_value="agency_id", + models=convert_agencies_sync_response_to_agencies_upsert(agencies) ) - async def update_agencies_sync_progress(self, page: int): - query = update( - AgenciesSyncState - ).values( - current_page=page + async def upsert_urls_from_data_sources( + self, + data_sources: list[DataSourcesSyncResponseInnerInfo] + ): + await self.bulk_upsert( + models=convert_data_sources_sync_response_to_url_upsert(data_sources) ) - await self.execute(query) + + async def update_agencies_sync_progress(self, page: int): + await self.execute(get_update_agencies_sync_progress_query(page)) + + async def update_data_sources_sync_progress(self, page: int): + await self.execute(get_update_data_sources_sync_progress_query(page)) + + async def mark_full_data_sources_sync(self): + await self.execute(get_mark_full_data_sources_sync_query()) async def mark_full_agencies_sync(self): - query = update( - AgenciesSyncState - ).values( - last_full_sync_at=func.now(), - current_cutoff_date=func.now() - text('interval \'1 day\''), - current_page=None - ) - await self.execute(query) + await self.execute(get_mark_full_agencies_sync_query()) @session_manager async def get_html_for_url( diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 8ec13085..827d0452 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -7,19 +7,19 @@ from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager -from src.db.dtos.batch import BatchInfo -from src.db.dtos.duplicate import DuplicateInsertInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.log import LogInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.templates import Base -from src.db.models.instantiations.duplicate import Duplicate -from src.db.models.instantiations.log import Log +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate +from 
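
The progress-update and mark-full-sync statements that were previously inlined in AsyncDatabaseClient are now produced by helper functions under src/core/tasks/scheduled/sync/*/queries/. Those modules are not shown in this hunk; the following is a hedged sketch of what the data-sources variants presumably look like, mirroring the removed agencies code above (the one-day cutoff offset and the cleared page counter are taken directly from it).

    from sqlalchemy import Update, func, text, update

    from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState

    def get_update_data_sources_sync_progress_query(page: int) -> Update:
        # Persist the page the sync loop is currently on.
        return update(DataSourcesSyncState).values(current_page=page)

    def get_mark_full_data_sources_sync_query() -> Update:
        # On completion of a full sync: stamp the time, move the cutoff to the
        # day before the present day, and clear the page counter.
        return update(DataSourcesSyncState).values(
            last_full_sync_at=func.now(),
            current_cutoff_date=func.now() - text("interval '1 day'"),
            current_page=None,
        )
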
src.db.models.instantiations.log.sqlalchemy import Log from src.db.models.instantiations.url.data_source import URLDataSource -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus diff --git a/src/db/client/types.py b/src/db/client/types.py index 5ee28c10..8b004e19 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -2,7 +2,7 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion UserSuggestionModel = UserRelevantSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index 80cbcd93..0b2379ef 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -2,7 +2,7 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 5397c803..40aa8fa1 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -8,16 +8,15 @@ from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo -from src.db.dtos.url.core import URLInfo from src.db.dtos.url.with_html import URLWithHTML from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.html_content import URLHTMLContent -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import 
AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion diff --git a/src/db/dtos/duplicate.py b/src/db/dtos/duplicate.py deleted file mode 100644 index d978f91e..00000000 --- a/src/db/dtos/duplicate.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import BaseModel - - -class DuplicateInsertInfo(BaseModel): - original_url_id: int - duplicate_batch_id: int - -class DuplicateInfo(DuplicateInsertInfo): - source_url: str - original_batch_id: int - duplicate_metadata: dict - original_metadata: dict \ No newline at end of file diff --git a/src/db/dtos/metadata_annotation.py b/src/db/dtos/metadata_annotation.py deleted file mode 100644 index 5a004cf1..00000000 --- a/src/db/dtos/metadata_annotation.py +++ /dev/null @@ -1,11 +0,0 @@ -from datetime import datetime - -from pydantic import BaseModel - - -class MetadataAnnotationInfo(BaseModel): - id: int - user_id: int - metadata_id: int - value: str - created_at: datetime diff --git a/src/db/dtos/url/metadata.py b/src/db/dtos/url/metadata.py deleted file mode 100644 index acac01b8..00000000 --- a/src/db/dtos/url/metadata.py +++ /dev/null @@ -1,19 +0,0 @@ -from datetime import datetime -from typing import Optional - -from pydantic import BaseModel - -from src.db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource - - -class URLMetadataInfo(BaseModel): - id: Optional[int] = None - url_id: Optional[int] = None - attribute: Optional[URLMetadataAttributeType] = None - # TODO: May need to add validation here depending on the type of attribute - value: Optional[str] = None - notes: Optional[str] = None - validation_status: Optional[ValidationStatus] = None - validation_source: Optional[ValidationSource] = None - created_at: Optional[datetime] = None - updated_at: Optional[datetime] = None \ No newline at end of file diff --git a/src/db/enums.py b/src/db/enums.py index 0a45addd..03834e9e 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -42,6 +42,7 @@ class TaskType(PyEnum): IDLE = "Idle" PROBE_404 = "404 Probe" SYNC_AGENCIES = "Sync Agencies" + SYNC_DATA_SOURCES = "Sync Data Sources" class PGEnum(TypeDecorator): impl = postgresql.ENUM diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index f72f06ba..62dff0bd 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -7,7 +7,7 @@ def get_created_at_column(): def get_agency_id_foreign_column( nullable: bool = False -): +) -> Column: return Column( 'agency_id', Integer(), diff --git a/src/db/models/instantiations/agency/__init__.py b/src/db/models/instantiations/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/agency/pydantic/__init__.py b/src/db/models/instantiations/agency/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/agency/pydantic/upsert.py b/src/db/models/instantiations/agency/pydantic/upsert.py new file mode 100644 index 00000000..4666a878 --- /dev/null +++ b/src/db/models/instantiations/agency/pydantic/upsert.py @@ -0,0 +1,23 @@ +from datetime import datetime + +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.templates import Base +from src.db.templates.upsert import UpsertModel + + +class AgencyUpsertModel(UpsertModel): + + @property + def id_field(self) -> str: + return "agency_id" + + @property + def sa_model(self) -> type[Base]: + return Agency + + agency_id: int + name: str + state: str | None + county: str | None + locality: str 
| None + ds_last_updated_at: datetime diff --git a/src/db/models/instantiations/agency.py b/src/db/models/instantiations/agency/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/agency.py rename to src/db/models/instantiations/agency/sqlalchemy.py diff --git a/src/db/models/instantiations/batch/__init__.py b/src/db/models/instantiations/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/batch.py b/src/db/models/instantiations/batch/pydantic.py similarity index 100% rename from src/db/dtos/batch.py rename to src/db/models/instantiations/batch/pydantic.py diff --git a/src/db/models/instantiations/batch.py b/src/db/models/instantiations/batch/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/batch.py rename to src/db/models/instantiations/batch/sqlalchemy.py diff --git a/src/db/models/instantiations/confirmed_url_agency.py b/src/db/models/instantiations/confirmed_url_agency.py index db63b114..b8a50a21 100644 --- a/src/db/models/instantiations/confirmed_url_agency.py +++ b/src/db/models/instantiations/confirmed_url_agency.py @@ -1,5 +1,5 @@ -from sqlalchemy import UniqueConstraint -from sqlalchemy.orm import relationship +from sqlalchemy import UniqueConstraint, Column +from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin @@ -9,7 +9,7 @@ class ConfirmedURLAgency(URLDependentMixin, StandardModel): __tablename__ = "confirmed_url_agency" - agency_id = get_agency_id_foreign_column() + agency_id: Mapped[int] = get_agency_id_foreign_column() url = relationship("URL", back_populates="confirmed_agencies") agency = relationship("Agency", back_populates="confirmed_urls") diff --git a/src/db/models/instantiations/duplicate/__init__.py b/src/db/models/instantiations/duplicate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/duplicate/pydantic/__init__.py b/src/db/models/instantiations/duplicate/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/duplicate/pydantic/info.py b/src/db/models/instantiations/duplicate/pydantic/info.py new file mode 100644 index 00000000..3a020e04 --- /dev/null +++ b/src/db/models/instantiations/duplicate/pydantic/info.py @@ -0,0 +1,8 @@ +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo + + +class DuplicateInfo(DuplicateInsertInfo): + source_url: str + original_batch_id: int + duplicate_metadata: dict + original_metadata: dict diff --git a/src/db/models/instantiations/duplicate/pydantic/insert.py b/src/db/models/instantiations/duplicate/pydantic/insert.py new file mode 100644 index 00000000..f753e217 --- /dev/null +++ b/src/db/models/instantiations/duplicate/pydantic/insert.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class DuplicateInsertInfo(BaseModel): + original_url_id: int + duplicate_batch_id: int + diff --git a/src/db/models/instantiations/duplicate.py b/src/db/models/instantiations/duplicate/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/duplicate.py rename to src/db/models/instantiations/duplicate/sqlalchemy.py diff --git a/src/db/models/instantiations/log/__init__.py b/src/db/models/instantiations/log/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/log/pydantic/__init__.py b/src/db/models/instantiations/log/pydantic/__init__.py new file 
mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/log.py b/src/db/models/instantiations/log/pydantic/info.py similarity index 65% rename from src/db/dtos/log.py rename to src/db/models/instantiations/log/pydantic/info.py index 43ed1cec..aa9b06ee 100644 --- a/src/db/dtos/log.py +++ b/src/db/models/instantiations/log/pydantic/info.py @@ -9,8 +9,3 @@ class LogInfo(BaseModel): log: str batch_id: int created_at: Optional[datetime] = None - -class LogOutputInfo(BaseModel): - id: Optional[int] = None - log: str - created_at: Optional[datetime] = None \ No newline at end of file diff --git a/src/db/models/instantiations/log/pydantic/output.py b/src/db/models/instantiations/log/pydantic/output.py new file mode 100644 index 00000000..c58eab0f --- /dev/null +++ b/src/db/models/instantiations/log/pydantic/output.py @@ -0,0 +1,10 @@ +from datetime import datetime +from typing import Optional + +from pydantic import BaseModel + + +class LogOutputInfo(BaseModel): + id: Optional[int] = None + log: str + created_at: Optional[datetime] = None diff --git a/src/db/models/instantiations/log.py b/src/db/models/instantiations/log/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/log.py rename to src/db/models/instantiations/log/sqlalchemy.py diff --git a/src/db/models/instantiations/sync_state/__init__.py b/src/db/models/instantiations/sync_state/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/sync_state_agencies.py b/src/db/models/instantiations/sync_state/agencies.py similarity index 100% rename from src/db/models/instantiations/sync_state_agencies.py rename to src/db/models/instantiations/sync_state/agencies.py diff --git a/src/db/models/instantiations/sync_state/data_sources.py b/src/db/models/instantiations/sync_state/data_sources.py new file mode 100644 index 00000000..cf173860 --- /dev/null +++ b/src/db/models/instantiations/sync_state/data_sources.py @@ -0,0 +1,28 @@ +from sqlalchemy import Integer, Column, DateTime, Date + +from src.db.models.templates import Base + + +class DataSourcesSyncState(Base): + __tablename__ = 'data_sources_sync_state' + id = Column(Integer, primary_key=True) + last_full_sync_at = Column( + DateTime(), + nullable=True, + comment="The datetime of the last *full* sync " + "(i.e., the last sync that got all entries " + "available to be synchronized)." + ) + current_cutoff_date = Column( + Date(), + nullable=True, + comment="Tracks the cutoff date passed to the data sources sync endpoint." + "On completion of a full sync, this is set to " + "the day before the present day." + ) + current_page = Column( + Integer(), + nullable=True, + comment="Tracks the current page passed to the data sources sync endpoint." + "On completion of a full sync, this is set to `null`." 
+ ) \ No newline at end of file diff --git a/src/db/models/instantiations/url/core/__init__.py b/src/db/models/instantiations/url/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/core/pydantic/__init__.py b/src/db/models/instantiations/url/core/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/core.py b/src/db/models/instantiations/url/core/pydantic/info.py similarity index 100% rename from src/db/dtos/url/core.py rename to src/db/models/instantiations/url/core/pydantic/info.py diff --git a/src/db/models/instantiations/url/core/pydantic/upsert.py b/src/db/models/instantiations/url/core/pydantic/upsert.py new file mode 100644 index 00000000..368befbd --- /dev/null +++ b/src/db/models/instantiations/url/core/pydantic/upsert.py @@ -0,0 +1,24 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.templates import Base +from src.db.templates.upsert import UpsertModel +from src.db.models.instantiations.url.core.sqlalchemy import URL + + +class URLUpsertModel(UpsertModel): + + @property + def id_field(self) -> str: + return "id" + + @property + def sa_model(self) -> type[Base]: + return URL + + id: int + url: str + name: str + description: str + collector_metadata: dict | None = None + outcome: URLStatus + record_type: RecordType \ No newline at end of file diff --git a/src/db/models/instantiations/url/core.py b/src/db/models/instantiations/url/core/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/core.py rename to src/db/models/instantiations/url/core/sqlalchemy.py diff --git a/src/db/models/instantiations/url/error_info/__init__.py b/src/db/models/instantiations/url/error_info/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/error.py b/src/db/models/instantiations/url/error_info/pydantic.py similarity index 100% rename from src/db/dtos/url/error.py rename to src/db/models/instantiations/url/error_info/pydantic.py diff --git a/src/db/models/instantiations/url/error_info.py b/src/db/models/instantiations/url/error_info/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/error_info.py rename to src/db/models/instantiations/url/error_info/sqlalchemy.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py b/src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py b/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/annotations/auto/relevancy.py b/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/input.py similarity index 100% rename from src/db/dtos/url/annotations/auto/relevancy.py rename to src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/input.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto.py b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/auto.py rename to src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists.py index 656b56f3..41a8fc8d 
100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists.py @@ -18,7 +18,7 @@ from src.collectors.enums import URLStatus from src.db.constants import ALL_ANNOTATION_MODELS -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index 8ac1b4af..bd16f149 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -7,7 +7,7 @@ from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 571db2a0..11a332dd 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -6,8 +6,8 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 503af6c3..5e27496a 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -1,11 +1,11 @@ from typing import Any, Type -from sqlalchemy import select, func, case +from sqlalchemy import select, func from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion diff 
--git a/src/db/queries/implementations/core/tasks/agency_sync/upsert.py b/src/db/queries/implementations/core/tasks/agency_sync/upsert.py deleted file mode 100644 index cff2044b..00000000 --- a/src/db/queries/implementations/core/tasks/agency_sync/upsert.py +++ /dev/null @@ -1,19 +0,0 @@ -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo - - -def get_upsert_agencies_mappings( - agencies: list[AgenciesSyncResponseInnerInfo] -) -> list[dict]: - agency_dicts = [] - for agency in agencies: - agency_dict = { - 'agency_id': agency.agency_id, - 'name': agency.display_name, - 'state': agency.state_name, - 'county': agency.county_name, - 'locality': agency.locality_name, - 'ds_last_updated_at': agency.updated_at - } - agency_dicts.append(agency_dict) - - return agency_dicts \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 9d5faa97..fbdc9511 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -13,8 +13,8 @@ from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.types import UserSuggestionType diff --git a/src/db/templates/__init__.py b/src/db/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/upsert.py b/src/db/templates/upsert.py new file mode 100644 index 00000000..d80de944 --- /dev/null +++ b/src/db/templates/upsert.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod + +from pydantic import BaseModel + +from src.db.models.templates import Base + + +class UpsertModel(BaseModel, ABC): + """An abstract base class for encapsulating upsert operations.""" + + @property + def id_field(self) -> str: + """Defines the field to be used as the primary key.""" + return "id" + + @property + @abstractmethod + def sa_model(self) -> type[Base]: + """Defines the SQLAlchemy model to be upserted.""" + pass \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 126e7970..d0fe5464 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -2,11 +2,13 @@ from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, 
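
The deleted get_upsert_agencies_mappings above returned raw dicts keyed for bulk_upsert; its replacement, convert_agencies_sync_response_to_agencies_upsert (defined in the new sync/agency/queries/upsert.py, not shown in this hunk), presumably performs the same field mapping but yields typed AgencyUpsertModel instances. A hedged sketch:

    from src.db.models.instantiations.agency.pydantic.upsert import AgencyUpsertModel
    from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo

    def convert_agencies_sync_response_to_agencies_upsert(
        agencies: list[AgenciesSyncResponseInnerInfo]
    ) -> list[AgencyUpsertModel]:
        # Same mapping as the removed dict-based helper, now validated by pydantic.
        return [
            AgencyUpsertModel(
                agency_id=agency.agency_id,
                name=agency.display_name,
                state=agency.state_name,
                county=agency.county_name,
                locality=agency.locality_name,
                ds_last_updated_at=agency.updated_at,
            )
            for agency in agencies
        ]

bulk_upsert then reads sa_model and id_field from the first model to build the INSERT ... ON CONFLICT (agency_id) DO UPDATE statement and uses model_dump() for the per-row values; because id_field and sa_model are plain properties rather than pydantic fields, they never leak into those mappings.
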
DataSourcesSyncResponseInnerInfo from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo from src.external.pdap.enums import MatchAgencyResponseStatus @@ -175,4 +177,34 @@ async def sync_agencies( AgenciesSyncResponseInnerInfo(**entry) for entry in response_info.data["agencies"] ] + ) + + async def sync_data_sources( + self, + params: DataSourcesSyncParameters + ) -> DataSourcesSyncResponseInfo: + url = self.access_manager.build_url( + namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, + subdomains=[ + "data-sources", + "sync" + ] + ) + headers = await self.access_manager.jwt_header() + headers['Content-Type'] = "application/json" + request_info = RequestInfo( + type_=RequestType.GET, + url=url, + headers=headers, + params={ + "page": params.page, + "update_at": params.cutoff_date + } + ) + response_info = await self.access_manager.make_request(request_info) + return DataSourcesSyncResponseInfo( + data_sources=[ + DataSourcesSyncResponseInnerInfo(**entry) + for entry in response_info.data["data_sources"] + ] ) \ No newline at end of file diff --git a/src/external/pdap/dtos/sync/__init__.py b/src/external/pdap/dtos/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/dtos/agencies_sync.py b/src/external/pdap/dtos/sync/agencies.py similarity index 100% rename from src/external/pdap/dtos/agencies_sync.py rename to src/external/pdap/dtos/sync/agencies.py diff --git a/src/external/pdap/dtos/sync/data_sources.py b/src/external/pdap/dtos/sync/data_sources.py new file mode 100644 index 00000000..b7e275e9 --- /dev/null +++ b/src/external/pdap/dtos/sync/data_sources.py @@ -0,0 +1,21 @@ +from datetime import datetime + +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus + + +class DataSourcesSyncResponseInnerInfo(BaseModel): + id: int + url: str + name: str + description: str + record_type: RecordType + agency_ids: list[int] + approval_status: ApprovalStatus + url_status: DataSourcesURLStatus + updated_at: datetime + +class DataSourcesSyncResponseInfo(BaseModel): + data_sources: list[DataSourcesSyncResponseInnerInfo] \ No newline at end of file diff --git a/src/external/pdap/enums.py b/src/external/pdap/enums.py index 36111acd..c532f820 100644 --- a/src/external/pdap/enums.py +++ b/src/external/pdap/enums.py @@ -12,3 +12,9 @@ class ApprovalStatus(Enum): REJECTED = "rejected" PENDING = "pending" NEEDS_IDENTIFICATION = "needs identification" + +class DataSourcesURLStatus(Enum): + AVAILABLE = "available" + BROKEN = "broken" + OK = "ok" + NONE_FOUND = "none found" \ No newline at end of file diff --git a/tests/automated/integration/api/review/rejection/helpers.py b/tests/automated/integration/api/review/rejection/helpers.py index 8fb26603..1e825694 100644 --- a/tests/automated/integration/api/review/rejection/helpers.py +++ b/tests/automated/integration/api/review/rejection/helpers.py @@ -2,7 +2,7 @@ from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.review.reject.dto import FinalReviewRejectionInfo from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py 
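
How the new pieces fit together at runtime is defined by the data-sources sync operator elsewhere in this patch (src/core/tasks/scheduled/sync/data_sources/operator.py, not shown here). The following is only a rough sketch of a paginated sync pass using the client methods added above; the loop shape, the start-at-page-1 default, the empty-page termination, and constructing DataSourcesSyncParameters with page and cutoff_date keywords (mirroring the attributes read in PDAPClient.sync_data_sources) are all assumptions.

    from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters
    from src.db.client.async_ import AsyncDatabaseClient
    from src.external.pdap.client import PDAPClient

    async def run_data_sources_sync(
        adb_client: AsyncDatabaseClient,
        pdap_client: PDAPClient,
    ) -> None:
        params = await adb_client.get_data_sources_sync_parameters()
        page = params.page or 1  # assumption: a null page means start from the first page
        while True:
            response = await pdap_client.sync_data_sources(
                DataSourcesSyncParameters(page=page, cutoff_date=params.cutoff_date)
            )
            if not response.data_sources:
                # assumption: an empty page means the full sync has completed
                await adb_client.mark_full_data_sources_sync()
                return
            await adb_client.upsert_urls_from_data_sources(response.data_sources)
            await adb_client.update_data_sources_sync_progress(page)
            page += 1
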
b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index 9afc16d8..9b51311a 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -5,9 +5,9 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py index eea90bf2..07408ff0 100644 --- a/tests/automated/integration/api/test_batch.py +++ b/tests/automated/integration/api/test_batch.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.dtos.url.insert import InsertURLsInfo from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType, URLStatus diff --git a/tests/automated/integration/api/test_example_collector.py b/tests/automated/integration/api/test_example_collector.py index 1e20362d..2903c528 100644 --- a/tests/automated/integration/api/test_example_collector.py +++ b/tests/automated/integration/api/test_example_collector.py @@ -7,7 +7,7 @@ from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO from src.collectors.source_collectors.example.core import ExampleCollector from src.collectors.enums import CollectorType diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index a7be37e4..8f51ab9c 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -4,8 +4,8 @@ from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.collectors.enums import CollectorType from src.core.enums import RecordType diff --git a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py index 33a93998..37ed6462 100644 --- a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py +++ 
b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py @@ -1,7 +1,7 @@ import pytest from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 590f9cd1..90b52db4 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -4,7 +4,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/db/client/test_add_url_error_info.py b/tests/automated/integration/db/client/test_add_url_error_info.py index 34d103ce..3bb25e58 100644 --- a/tests/automated/integration/db/client/test_add_url_error_info.py +++ b/tests/automated/integration/db/client/test_add_url_error_info.py @@ -1,7 +1,7 @@ import pytest from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_delete_old_logs.py b/tests/automated/integration/db/client/test_delete_old_logs.py index d451af8f..1a5b0cd7 100644 --- a/tests/automated/integration/db/client/test_delete_old_logs.py +++ b/tests/automated/integration/db/client/test_delete_old_logs.py @@ -2,7 +2,7 @@ import pytest -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_delete_url_updated_at.py b/tests/automated/integration/db/client/test_delete_url_updated_at.py index a6ca731b..d923d770 100644 --- a/tests/automated/integration/db/client/test_delete_url_updated_at.py +++ b/tests/automated/integration/db/client/test_delete_url_updated_at.py @@ -1,4 +1,4 @@ -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_logs.py b/tests/automated/integration/db/client/test_insert_logs.py index d752c894..6da198d8 100644 --- a/tests/automated/integration/db/client/test_insert_logs.py +++ b/tests/automated/integration/db/client/test_insert_logs.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 
73a88d02..2f304219 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -1,8 +1,8 @@ import pytest from src.core.enums import BatchStatus -from src.db.dtos.batch import BatchInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.mark.asyncio diff --git a/tests/automated/integration/db/test_database_structure.py b/tests/automated/integration/db/test_database_structure.py index 7b34cebb..4b73bd3d 100644 --- a/tests/automated/integration/db/test_database_structure.py +++ b/tests/automated/integration/db/test_database_structure.py @@ -20,7 +20,7 @@ from src.db.dtos.url.insert import InsertURLsInfo from src.db.enums import URLHTMLContentType from src.db.helpers import get_postgres_connection_string -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus, SuggestionType from src.db.models.templates import Base diff --git a/tests/automated/integration/tasks/scheduled/sync/__init__.py b/tests/automated/integration/tasks/scheduled/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/agency/__init__.py b/tests/automated/integration/tasks/scheduled/sync/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/conftest.py b/tests/automated/integration/tasks/scheduled/sync/agency/conftest.py similarity index 74% rename from tests/automated/integration/tasks/scheduled/agency_sync/conftest.py rename to tests/automated/integration/tasks/scheduled/sync/agency/conftest.py index b621250f..8ba4221f 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/conftest.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/conftest.py @@ -1,7 +1,7 @@ import pytest_asyncio -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import update_existing_agencies_updated_at, \ +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import update_existing_agencies_updated_at, \ add_existing_agencies @pytest_asyncio.fixture diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/data.py b/tests/automated/integration/tasks/scheduled/sync/agency/data.py similarity index 97% rename from tests/automated/integration/tasks/scheduled/agency_sync/data.py rename to tests/automated/integration/tasks/scheduled/sync/agency/data.py index fa06ea33..d3227393 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/data.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/data.py @@ -1,6 +1,6 @@ from datetime import datetime -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo PREEXISTING_AGENCY_1 = AgenciesSyncResponseInnerInfo( display_name="Preexisting Agency 1", diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/existence_checker.py 
b/tests/automated/integration/tasks/scheduled/sync/agency/existence_checker.py similarity index 80% rename from tests/automated/integration/tasks/scheduled/agency_sync/existence_checker.py rename to tests/automated/integration/tasks/scheduled/sync/agency/existence_checker.py index 150df5b0..292f4aea 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/existence_checker.py @@ -1,6 +1,6 @@ -from src.db.models.instantiations.agency import Agency -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.agency_sync.data import FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from tests.automated.integration.tasks.scheduled.sync.agency.data import FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE class AgencyChecker: diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/helpers.py b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py similarity index 92% rename from tests/automated/integration/tasks/scheduled/agency_sync/helpers.py rename to tests/automated/integration/tasks/scheduled/sync/agency/helpers.py index c05e61f7..593ec1e1 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/helpers.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py @@ -5,10 +5,10 @@ from sqlalchemy import select, func, TIMESTAMP, cast from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState from src.external.pdap.client import PDAPClient -from tests.automated.integration.tasks.scheduled.agency_sync.data import PREEXISTING_AGENCIES +from tests.automated.integration.tasks.scheduled.sync.agency.data import PREEXISTING_AGENCIES async def check_sync_concluded( diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py similarity index 77% rename from tests/automated/integration/tasks/scheduled/agency_sync/test_happy_path.py rename to tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py index 863acf5c..c7d6bca7 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py @@ -3,12 +3,12 @@ import pytest from sqlalchemy import select -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters -from src.db.models.instantiations.agency import Agency -from tests.automated.integration.tasks.scheduled.agency_sync.data import AGENCIES_SYNC_RESPONSES -from tests.automated.integration.tasks.scheduled.agency_sync.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import check_sync_concluded, patch_sync_agencies +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator 
+from src.db.models.instantiations.agency.sqlalchemy import Agency +from tests.automated.integration.tasks.scheduled.sync.agency.data import AGENCIES_SYNC_RESPONSES +from tests.automated.integration.tasks.scheduled.sync.agency.existence_checker import AgencyChecker +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import check_sync_concluded, patch_sync_agencies from tests.helpers.asserts import assert_task_run_success diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/test_interruption.py b/tests/automated/integration/tasks/scheduled/sync/agency/test_interruption.py similarity index 84% rename from tests/automated/integration/tasks/scheduled/agency_sync/test_interruption.py rename to tests/automated/integration/tasks/scheduled/sync/agency/test_interruption.py index f11e4e1f..41f4b86c 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_interruption.py @@ -1,14 +1,14 @@ import pytest from sqlalchemy import select -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator from src.core.tasks.url.enums import TaskOperatorOutcome -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState -from tests.automated.integration.tasks.scheduled.agency_sync.data import FIRST_CALL_RESPONSE, \ +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from tests.automated.integration.tasks.scheduled.sync.agency.data import FIRST_CALL_RESPONSE, \ THIRD_CALL_RESPONSE, SECOND_CALL_RESPONSE -from tests.automated.integration.tasks.scheduled.agency_sync.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import patch_sync_agencies, check_sync_concluded +from tests.automated.integration.tasks.scheduled.sync.agency.existence_checker import AgencyChecker +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import patch_sync_agencies, check_sync_concluded @pytest.mark.asyncio diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/sync/agency/test_no_new_results.py similarity index 74% rename from tests/automated/integration/tasks/scheduled/agency_sync/test_no_new_results.py rename to tests/automated/integration/tasks/scheduled/sync/agency/test_no_new_results.py index fcc353ef..20a179bd 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/test_no_new_results.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_no_new_results.py @@ -4,13 +4,13 @@ import pytest from sqlalchemy import select -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState -from tests.automated.integration.tasks.scheduled.agency_sync.data import THIRD_CALL_RESPONSE -from tests.automated.integration.tasks.scheduled.agency_sync.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import patch_sync_agencies, check_sync_concluded 
+from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from tests.automated.integration.tasks.scheduled.sync.agency.data import THIRD_CALL_RESPONSE +from tests.automated.integration.tasks.scheduled.sync.agency.existence_checker import AgencyChecker +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import patch_sync_agencies, check_sync_concluded from tests.helpers.asserts import assert_task_run_success diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py new file mode 100644 index 00000000..67019539 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py @@ -0,0 +1,11 @@ +import pytest_asyncio + +from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator + + +@pytest_asyncio.fixture +async def setup( + db_data_creator, + mock_pdap_client +) -> SyncDataSourcesTaskOperator: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/data.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/data.py new file mode 100644 index 00000000..abf88b86 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/data.py @@ -0,0 +1,2 @@ +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo + diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py new file mode 100644 index 00000000..c9ea857c --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py @@ -0,0 +1,5 @@ + + +class URLExistenceChecker: + def __init__(self): + self._dict = {"url": url} \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py new file mode 100644 index 00000000..5996fc4f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py @@ -0,0 +1,131 @@ +from contextlib import contextmanager +from datetime import datetime +from unittest.mock import patch + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source import URLDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import 
ApprovalStatus +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.info import TestDataSourcesSyncSetupInfo +from tests.helpers.db_data_creator import DBDataCreator + + +async def setup_data( + db_data_creator: DBDataCreator, + mock_pdap_client: PDAPClient +) -> TestDataSourcesSyncSetupInfo: + adb_client = db_data_creator.adb_client + + agency_id_preexisting_urls = await db_data_creator.agency() + agency_id_new_urls = await db_data_creator.agency() + + # Setup data sources + + + # Setup pre-existing urls + preexisting_urls = [ + URL( + url='https://example.com/1', + name='Pre-existing URL 1', + description='Pre-existing URL 1 Description', + collector_metadata={}, + outcome=URLStatus.PENDING.value, + record_type=RecordType.ACCIDENT_REPORTS.value, + updated_at=datetime(2023, 1, 1, 0, 0, 0), + ), + URL( + url='https://example.com/2', + name='Pre-existing URL 2', + description='Pre-existing URL 2 Description', + collector_metadata={}, + outcome=URLStatus.VALIDATED.value, + record_type=RecordType.ACCIDENT_REPORTS.value, + updated_at=datetime(2025, 10, 17, 3, 0, 0), + ), + ] + preexisting_url_ids = await adb_client.add_all(preexisting_urls, return_ids=True) + # Link second pre-existing url to data source + await adb_client.add(URLDataSource( + url_id=preexisting_url_ids[1], + data_source_id=preexisting_url_ids[1] + )) + + # Link second pre-existing url to agency + await adb_client.add(ConfirmedURLAgency( + url_id=preexisting_url_ids[1], + agency_id=agency_id_preexisting_urls + )) + + + first_call_response = DataSourcesSyncResponseInfo( + data_sources=[ + DataSourcesSyncResponseInnerInfo( + id=120, + url="https://newurl.com/1", + name="New URL 1", + description="New URL 1 Description", + approval_status=ApprovalStatus.APPROVED, + updated_at=datetime(2023, 1, 1, 0, 0, 0), + record_type=RecordType.ACCIDENT_REPORTS.value, + agency_ids=[agency_id_new_urls], + ), + DataSourcesSyncResponseInnerInfo( + id=121, + url="https://newurl.com/2", + name="New URL 2", + description="New URL 2 Description", + approval_status=ApprovalStatus.APPROVED, + updated_at=datetime(2023, 1, 1, 0, 0, 0), + record_type=RecordType.ACCIDENT_REPORTS.value, + agency_ids=[agency_id_new_urls], + ), + DataSourcesSyncResponseInnerInfo( + id=122, + url="https://newurl.com/3", + name="New URL 3", + description="New URL 3 Description", + approval_status=ApprovalStatus.APPROVED, + updated_at=datetime(2023, 1, 1, 0, 0, 0), + record_type=RecordType.ACCIDENT_REPORTS.value, + agency_ids=[agency_id_new_urls], + ), + DataSourcesSyncResponseInnerInfo( + id=123, + url="https://newurl.com/4", + name="New URL 4", + description="New URL 4 Description", + approval_status=ApprovalStatus.APPROVED, + updated_at=datetime(2023, 1, 1, 0, 0, 0), + record_type=RecordType.ACCIDENT_REPORTS.value, + agency_ids=[agency_id_new_urls], + ), + DataSourcesSyncResponseInnerInfo( + id=preexisting_url_ids[0], + url="https://newurl.com/5", + name="Updated Preexisting URL 1", + description="Updated Preexisting URL 1 Description", + approval_status=ApprovalStatus.APPROVED, + updated_at=datetime(2023, 1, 1, 0, 0, 0), + record_type=RecordType.ACCIDENT_REPORTS.value, + agency_ids=[agency_id_preexisting_urls, agency_id_new_urls], + ] + + ) + + + + + + +@contextmanager +def patch_sync_data_sources(side_effects: list): + with patch.object( + PDAPClient, + "sync_data_sources", + side_effect=side_effects + ): + yield \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py 
b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py new file mode 100644 index 00000000..00c0b51e --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py @@ -0,0 +1,16 @@ +from pydantic import BaseModel + +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo + + +class TestDataSourcesSyncSetupInfo(BaseModel): + + class Config: + allow_arbitrary_types = True + + preexisting_urls: list[URL] + preexisting_urls_ids: list[int] + first_call_response: DataSourcesSyncResponseInfo + second_call_response: DataSourcesSyncResponseInfo + third_call_response: DataSourcesSyncResponseInfo \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/auto_relevant/test_task.py index 287b5f13..6458c8a9 100644 --- a/tests/automated/integration/tasks/url/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/auto_relevant/test_task.py @@ -1,9 +1,9 @@ import pytest from src.db.enums import TaskType -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.url.error_info import URLErrorInfo -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_task_has_expected_run_info, \ assert_prereqs_met from tests.automated.integration.tasks.url.auto_relevant.setup import setup_operator, setup_urls diff --git a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py index cb46c845..1ded4ba5 100644 --- a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py +++ b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py @@ -6,7 +6,7 @@ from src.core.tasks.url.operators.url_duplicate.core import URLDuplicateTaskOperator from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from tests.automated.integration.tasks.url.duplicate.constants import BATCH_CREATION_PARAMETERS diff --git a/tests/automated/integration/tasks/url/test_agency_preannotation_task.py 
b/tests/automated/integration/tasks/url/test_agency_preannotation_task.py index 03961fe0..f7b75f51 100644 --- a/tests/automated/integration/tasks/url/test_agency_preannotation_task.py +++ b/tests/automated/integration/tasks/url/test_agency_preannotation_task.py @@ -14,7 +14,7 @@ from src.external.pdap.enums import MatchAgencyResponseStatus from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.collectors.enums import CollectorType, URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.subtasks.agency_identification.auto_googler import AutoGooglerAgencyIdentificationSubtask diff --git a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py index 0bdc3718..3b3dd163 100644 --- a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py @@ -7,9 +7,9 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator from src.db.enums import TaskType -from src.db.models.instantiations.url.error_info import URLErrorInfo +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.data_source import URLDataSource -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.enums import RecordType, SubmitResponseStatus diff --git a/tests/automated/integration/tasks/url/test_url_404_probe.py b/tests/automated/integration/tasks/url/test_url_404_probe.py index 7a88f759..2cc8294f 100644 --- a/tests/automated/integration/tasks/url/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/test_url_404_probe.py @@ -8,7 +8,7 @@ from src.core.tasks.url.operators.url_404_probe.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo diff --git a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py index e3d7c529..e9f55240 100644 --- a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py @@ -4,7 +4,7 @@ from src.core.tasks.url.operators.url_miscellaneous_metadata.core import URLMiscellaneousMetadataTaskOperator from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core import URL +from 
src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/unit/core/test_core_logger.py b/tests/automated/unit/core/test_core_logger.py index f6738011..580f18bd 100644 --- a/tests/automated/unit/core/test_core_logger.py +++ b/tests/automated/unit/core/test_core_logger.py @@ -3,7 +3,7 @@ import pytest -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index 96fbf8c4..22770205 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -5,7 +5,7 @@ from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 070f9533..c54e624e 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -4,7 +4,7 @@ from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index b3e9fec1..863e614b 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -6,7 +6,7 @@ from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO diff --git a/tests/helpers/db_data_creator.py b/tests/helpers/db_data_creator.py index 1a1d0a70..1f91bb05 100644 --- a/tests/helpers/db_data_creator.py +++ b/tests/helpers/db_data_creator.py @@ -9,13 +9,13 @@ from src.api.endpoints.review.enums import RejectionReason from 
src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.batch import BatchInfo -from src.db.dtos.duplicate import DuplicateInsertInfo -from src.db.dtos.url.annotations.auto.relevancy import AutoRelevancyAnnotationInput +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.dtos.url.mapping import URLMapping from src.db.client.sync import DatabaseClient from src.db.dtos.url.raw_html import RawHTMLInfo diff --git a/tests/helpers/setup/populate.py b/tests/helpers/setup/populate.py index 1741253b..a6bf5234 100644 --- a/tests/helpers/setup/populate.py +++ b/tests/helpers/setup/populate.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL async def populate_database(adb_client: AsyncDatabaseClient) -> None: diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index ae78c5dd..0536a1d9 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -2,7 +2,7 @@ import dotenv -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index d6f10064..37e71666 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,4 +1,4 @@ -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from src.collectors.source_collectors.ckan import group_search, package_search, organization_search diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index 772d4d4a..2e4e0227 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,4 +1,4 @@ -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/external/pdap/test_sync_agencies.py b/tests/manual/external/pdap/test_sync_agencies.py index 6d070977..6eeaf7c3 100644 --- 
a/tests/manual/external/pdap/test_sync_agencies.py +++ b/tests/manual/external/pdap/test_sync_agencies.py @@ -1,7 +1,7 @@ import pytest import time -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters @pytest.mark.asyncio diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 251d123c..7cf002f6 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -5,7 +5,7 @@ from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.db_data_creator import DBDataCreator URLS = [ From 241113e3816f0f51d3732f361009f8998d44a87e Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 22 Jul 2025 17:29:51 -0400 Subject: [PATCH 2/6] Continue draft on agencies sync logic --- .../scheduled/sync/data_sources/check.py | 41 +++++ .../tasks/scheduled/sync/data_sources/data.py | 2 - .../sync/data_sources/existence_checker.py | 41 ++++- .../scheduled/sync/data_sources/setup/core.py | 82 ++++++++-- .../scheduled/sync/data_sources/setup/data.py | 153 ++++++++++++++++++ .../scheduled/sync/data_sources/setup/info.py | 14 +- .../sync/data_sources/test_happy_path.py | 55 +++++++ 7 files changed, 373 insertions(+), 15 deletions(-) create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/check.py delete mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/data.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/check.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/check.py new file mode 100644 index 00000000..5968831f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/check.py @@ -0,0 +1,41 @@ +from datetime import timedelta + +from sqlalchemy import select, cast, func, TIMESTAMP + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState +from src.db.models.instantiations.url.core.sqlalchemy import URL + + +async def check_sync_concluded( + db_client: AsyncDatabaseClient, + check_updated_at: bool = True +): + + current_db_datetime = await db_client.scalar( + select( + cast(func.now(), TIMESTAMP) + ) + ) + + sync_state_results = await db_client.scalar( + select( + DataSourcesSyncState + ) + ) + assert sync_state_results.current_page is None + assert sync_state_results.last_full_sync_at > current_db_datetime - timedelta(minutes=5) + assert sync_state_results.current_cutoff_date > (current_db_datetime - timedelta(days=2)).date() + + if not check_updated_at: + return + + updated_ats = await db_client.scalars( + select( + URL.updated_at + ) + ) + assert all( + updated_at > current_db_datetime - timedelta(minutes=5) + for updated_at in updated_ats + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/data.py 
b/tests/automated/integration/tasks/scheduled/sync/data_sources/data.py deleted file mode 100644 index abf88b86..00000000 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/data.py +++ /dev/null @@ -1,2 +0,0 @@ -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo - diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py index c9ea857c..3e4cc3c5 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py @@ -1,5 +1,42 @@ +from collections import defaultdict + +from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source import URLDataSource +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo class URLExistenceChecker: - def __init__(self): - self._dict = {"url": url} \ No newline at end of file + + def __init__( + self, + responses: list[DataSourcesSyncResponseInfo], + url_ds_links: list[URLDataSource], + url_agency_links: list[ConfirmedURLAgency] + ): + self._ds_id_response_dict: dict[int, DataSourcesSyncResponseInnerInfo] = {} + for response in responses: + for data_source in response.data_sources: + self._ds_id_response_dict[data_source.id] = data_source + self._ds_id_url_link_dict = {} + for link in url_ds_links: + self._ds_id_url_link_dict[link.data_source_id] = link.url_id + self._url_id_agency_link_dict = defaultdict(list) + for link in url_agency_links: + self._url_id_agency_link_dict[link.url_id].append(link.agency_id) + + + def check(self, url: URL): + ds_id = self._ds_id_url_link_dict.get(url.id) + if ds_id is None: + raise AssertionError(f"URL {url.id} has no data source link") + response = self._ds_id_response_dict.get(ds_id) + if response is None: + raise AssertionError(f"Data source {ds_id} has no response") + + assert response.url == url.url + assert response.description == url.description + assert response.name == url.name + + agency_ids = self._url_id_agency_link_dict.get(url.id) + assert set(response.agency_ids) == set(agency_ids) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py index 5996fc4f..936d935e 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py @@ -2,14 +2,19 @@ from datetime import datetime from unittest.mock import patch +from pydantic import BaseModel + from src.collectors.enums import URLStatus from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source import URLDataSource from src.external.pdap.client import PDAPClient from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo -from src.external.pdap.enums import ApprovalStatus +from src.external.pdap.enums import ApprovalStatus, 
DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import TestURLSetupEntry, \ + SyncResponseOrder, TestURLPostSetupRecord, AgencyAssigned from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.info import TestDataSourcesSyncSetupInfo from tests.helpers.db_data_creator import DBDataCreator @@ -70,8 +75,9 @@ async def setup_data( description="New URL 1 Description", approval_status=ApprovalStatus.APPROVED, updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.ACCIDENT_REPORTS.value, + record_type=RecordType.ACCIDENT_REPORTS, agency_ids=[agency_id_new_urls], + url_status=DataSourcesURLStatus.OK ), DataSourcesSyncResponseInnerInfo( id=121, @@ -80,45 +86,101 @@ async def setup_data( description="New URL 2 Description", approval_status=ApprovalStatus.APPROVED, updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.ACCIDENT_REPORTS.value, + record_type=RecordType.FIELD_CONTACTS, agency_ids=[agency_id_new_urls], + url_status=DataSourcesURLStatus.BROKEN ), DataSourcesSyncResponseInnerInfo( id=122, url="https://newurl.com/3", name="New URL 3", description="New URL 3 Description", - approval_status=ApprovalStatus.APPROVED, + approval_status=ApprovalStatus.PENDING, updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.ACCIDENT_REPORTS.value, + record_type=RecordType.WANTED_PERSONS, agency_ids=[agency_id_new_urls], + url_status=DataSourcesURLStatus.OK ), DataSourcesSyncResponseInnerInfo( id=123, url="https://newurl.com/4", name="New URL 4", description="New URL 4 Description", - approval_status=ApprovalStatus.APPROVED, + approval_status=ApprovalStatus.NEEDS_IDENTIFICATION, updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.ACCIDENT_REPORTS.value, + record_type=RecordType.STOPS, agency_ids=[agency_id_new_urls], + url_status=DataSourcesURLStatus.OK ), DataSourcesSyncResponseInnerInfo( id=preexisting_url_ids[0], url="https://newurl.com/5", name="Updated Preexisting URL 1", description="Updated Preexisting URL 1 Description", - approval_status=ApprovalStatus.APPROVED, + approval_status=ApprovalStatus.REJECTED, # Status should update to rejected. 
updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.ACCIDENT_REPORTS.value, + record_type=RecordType.BOOKING_REPORTS, agency_ids=[agency_id_preexisting_urls, agency_id_new_urls], + url_status=DataSourcesURLStatus.OK + ) + ] + ) + second_call_response = DataSourcesSyncResponseInfo( + data_sources=[ + DataSourcesSyncResponseInnerInfo( + id=preexisting_url_ids[1], + url="https://newurl.com/6", + name="Updated Preexisting URL 2", + description="Updated Preexisting URL 2 Description", + approval_status=ApprovalStatus.APPROVED, # SC should stay validated + updated_at=datetime(2023, 1, 1, 0, 0, 0), + record_type=RecordType.PERSONNEL_RECORDS, + agency_ids=[agency_id_new_urls], + url_status=DataSourcesURLStatus.OK + ), ] - ) + third_call_response = DataSourcesSyncResponseInfo(data_sources=[]) + +class DataSourcesSyncTestSetupManager: + def __init__( + self, + adb_client: AsyncDatabaseClient, + entries: list[TestURLSetupEntry] + ): + self.adb_client = adb_client + self.entries = entries + + self.response_dict: dict[ + SyncResponseOrder, list[DataSourcesSyncResponseInfo] + ] = { + e: [] for e in SyncResponseOrder + } + self.test_agency_dict: dict[ + AgencyAssigned, int + ] = {} + + async def setup(self): + await self.setup_agencies() + + async def setup_entries(self): + for entry in self.entries: + await self.setup_entry(entry) + + async def setup_entry( + self, + entry: TestURLSetupEntry + ) -> TestURLPostSetupRecord: + if entry.sc_info is not None: + # TODO: Add SC entry + raise NotImplementedError() + if entry.ds_info is not None: + # TODO: Add DS entry + raise NotImplementedError() @contextmanager diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py new file mode 100644 index 00000000..d947e061 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py @@ -0,0 +1,153 @@ +from enum import Enum + +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus + +class SyncResponseOrder(Enum): + """Represents which sync response the entry is in.""" + FIRST = 1 + SECOND = 2 + # No entries should be in 3 + THIRD = 3 + +class AgencyAssigned(Enum): + """Represents which of several pre-created agencies the entry is assigned to.""" + ONE = 1 + TWO = 2 + THREE = 3 + +class TestDSURLSetupEntry(BaseModel): + """Represents URL previously existing in DS DB. + + These values should overwrite any SC values + """ + id: int # ID of URL in DS App + name: str + description: str + url_status: DataSourcesURLStatus + approval_status: ApprovalStatus + record_type: RecordType + agency_ids: list[AgencyAssigned] + sync_response_order: SyncResponseOrder + +class TestSCURLSetupEntry(BaseModel): + """Represents URL previously existing in SC DB. 
+ + These values should be overridden by any DS values + """ + name: str + description: str + record_type: RecordType + url_status: URLStatus + agency_ids: list[AgencyAssigned] + +class TestURLSetupEntry(BaseModel): + url: str + ds_info: TestDSURLSetupEntry | None # Represents URL previously existing in DS DB + sc_info: TestSCURLSetupEntry | None # Represents URL previously existing in SC DB + + final_status: URLStatus + +ENTRIES = [ + TestURLSetupEntry( + # A URL in both DBs that should be overwritten + url='https://example.com/1', + ds_info=TestDSURLSetupEntry( + id=100, + name='Overwritten URL 1 Name', + description='Overwritten URL 1 Description', + url_status=DataSourcesURLStatus.OK, + approval_status=ApprovalStatus.APPROVED, + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=[AgencyAssigned.ONE, AgencyAssigned.TWO], + sync_response_order=SyncResponseOrder.FIRST + ), + sc_info=TestSCURLSetupEntry( + name='Pre-existing URL 1', + description='Pre-existing URL 1 Description', + record_type=RecordType.ACCIDENT_REPORTS, + url_status=URLStatus.PENDING, + agency_ids=[AgencyAssigned.ONE, AgencyAssigned.THREE] + ), + final_status=URLStatus.VALIDATED + ), + TestURLSetupEntry( + # A DS-only approved but broken URL + url='https://example.com/2', + ds_info=TestDSURLSetupEntry( + id=101, + name='New URL 2 Name', + description='New URL 2 Description', + url_status=DataSourcesURLStatus.BROKEN, + approval_status=ApprovalStatus.APPROVED, + record_type=RecordType.INCARCERATION_RECORDS, + agency_ids=[AgencyAssigned.TWO], + sync_response_order=SyncResponseOrder.FIRST + ), + sc_info=None, + final_status=URLStatus.NOT_FOUND + ), + TestURLSetupEntry( + # An SC-only pending URL, should be unchanged. + url='https://example.com/3', + ds_info=None, + sc_info=TestSCURLSetupEntry( + name='Pre-existing URL 3 Name', + description='Pre-existing URL 3 Description', + record_type=RecordType.FIELD_CONTACTS, + url_status=URLStatus.PENDING, + agency_ids=[AgencyAssigned.ONE, AgencyAssigned.THREE] + ), + final_status=URLStatus.PENDING + ), + TestURLSetupEntry( + # A DS-only rejected URL + url='https://example.com/4', + ds_info=TestDSURLSetupEntry( + id=102, + name='New URL 4 Name', + description='New URL 4 Description', + url_status=DataSourcesURLStatus.OK, + approval_status=ApprovalStatus.REJECTED, + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=[AgencyAssigned.ONE], + sync_response_order=SyncResponseOrder.FIRST + ), + sc_info=None, + final_status=URLStatus.NOT_RELEVANT + ), + TestURLSetupEntry( + # A pre-existing URL in the second response + url='https://example.com/5', + ds_info=TestDSURLSetupEntry( + id=103, + name='New URL 5 Name', + description='New URL 5 Description', + url_status=DataSourcesURLStatus.OK, + approval_status=ApprovalStatus.APPROVED, + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=[AgencyAssigned.ONE], + sync_response_order=SyncResponseOrder.SECOND + ), + sc_info=TestSCURLSetupEntry( + name='Pre-existing URL 5 Name', + description='Pre-existing URL 5 Description', + record_type=RecordType.ACCIDENT_REPORTS, + url_status=URLStatus.PENDING, + agency_ids=[] + ), + final_status=URLStatus.VALIDATED + + ) +] + +class TestURLPostSetupRecord(BaseModel): + """Stores a setup entry along with relevant database-generated ids""" + url_id: int + sc_setup_entry: TestSCURLSetupEntry | None + ds_setup_entry: TestDSURLSetupEntry | None + sc_agency_ids: list[int] | None + ds_agency_ids: list[int] | None \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py index 00c0b51e..f16bdfa7 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py @@ -1,5 +1,7 @@ from pydantic import BaseModel +from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.url.core.sqlalchemy import URL from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo @@ -9,8 +11,18 @@ class TestDataSourcesSyncSetupInfo(BaseModel): class Config: allow_arbitrary_types = True + operator: SyncDataSourcesTaskOperator + db_client: AsyncDatabaseClient preexisting_urls: list[URL] preexisting_urls_ids: list[int] first_call_response: DataSourcesSyncResponseInfo second_call_response: DataSourcesSyncResponseInfo - third_call_response: DataSourcesSyncResponseInfo \ No newline at end of file + third_call_response: DataSourcesSyncResponseInfo + + @property + def data_sources_sync_response(self) -> list[DataSourcesSyncResponseInfo]: + return [ + self.first_call_response, + self.second_call_response, + self.third_call_response + ] \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py index e69de29b..59594923 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py @@ -0,0 +1,55 @@ +from unittest.mock import MagicMock, call + +import pytest + +from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.db.models.instantiations.url.core.sqlalchemy import URL +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.info import TestDataSourcesSyncSetupInfo +from tests.helpers.asserts import assert_task_run_success + + +@pytest.mark.asyncio +async def test_data_sources_sync_happy_path( + setup: TestDataSourcesSyncSetupInfo +): + operator = setup.operator + adb_client = operator.adb_client + + with patch_sync_data_sources([setup.first_call_response, setup.second_call_response, setup.third_call_response]): + run_info = await operator.run_task(1) + assert_task_run_success(run_info) + mock_func: MagicMock = operator.pdap_client.sync_data_sources + + mock_func.assert_has_calls( + [ + call( + DataSourcesSyncParameters( + cutoff_date=None, + page=1 + ) + ), + call( + DataSourcesSyncParameters( + cutoff_date=None, + page=2 + ) + ), + call( + DataSourcesSyncParameters( + cutoff_date=None, + page=3 + ) + ) + ] + ) + await check_sync_concluded(adb_client, check_updated_at=False) + + # Check six URLs in database + urls: list[URL] = await adb_client.get_all(URL) + assert len(urls) == 6 + + checker = URLChecker() + for url in urls: + checker.check_url(url) From 72f03a0ce68e39af6070ee6043aabb7fd3672302 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 23 Jul 2025 20:50:26 -0400 Subject: [PATCH 3/6] Continue draft --- ...1bab33_setup_for_sync_data_sources_task.py | 205 
++++++++++- .../agency/get/queries/next_for_annotation.py | 8 +- src/api/endpoints/review/approve/query.py | 4 +- src/api/endpoints/review/next/query.py | 4 +- src/core/core.py | 1 - .../data_sources/queries/mark_full_sync.py | 4 +- .../sync/data_sources/queries/upsert.py | 7 +- src/db/client/async_.py | 39 +- src/db/client/sync.py | 2 +- src/db/dto_converter.py | 6 +- src/db/enums.py | 6 + src/db/models/helpers.py | 19 +- .../instantiations/agency/sqlalchemy.py | 8 +- .../models/instantiations/backlog_snapshot.py | 4 +- .../models/instantiations/batch/sqlalchemy.py | 4 +- src/db/models/instantiations/change_log.py | 19 + .../instantiations/confirmed_url_agency.py | 6 +- .../instantiations/duplicate/sqlalchemy.py | 4 +- .../instantiations/link/link_batch_urls.py | 4 +- .../models/instantiations/log/sqlalchemy.py | 4 +- src/db/models/instantiations/missing.py | 4 +- .../models/instantiations/root_url_cache.py | 4 +- src/db/models/instantiations/task/core.py | 4 +- src/db/models/instantiations/task/error.py | 4 +- .../url/checked_for_duplicate.py | 4 +- .../instantiations/url/compressed_html.py | 4 +- .../url/core/pydantic/upsert.py | 1 - .../instantiations/url/core/sqlalchemy.py | 34 +- .../models/instantiations/url/data_source.py | 4 +- .../url/error_info/sqlalchemy.py | 4 +- .../models/instantiations/url/html_content.py | 4 +- .../url/optional_data_source_metadata.py | 4 +- .../instantiations/url/probed_for_404.py | 4 +- .../instantiations/url/reviewing_user.py | 4 +- .../url/suggestion/agency/auto.py | 4 +- .../url/suggestion/agency/user.py | 4 +- .../url/suggestion/record_type/auto.py | 4 +- .../url/suggestion/record_type/user.py | 4 +- .../suggestion/relevant/auto/sqlalchemy.py | 4 +- .../url/suggestion/relevant/user.py | 4 +- src/db/models/templates.py | 2 +- src/db/statement_composer.py | 4 +- .../api/review/rejection/helpers.py | 2 +- .../test_approve_and_get_next_source.py | 8 +- .../db/client/approve_url/test_basic.py | 8 +- .../integration/db/structure/README.md | 6 + .../integration/db/structure/__init__.py | 0 .../integration/db/structure/test_batch.py | 88 +++++ .../db/structure/test_html_content.py | 38 ++ .../integration/db/structure/test_root_url.py | 32 ++ .../db/structure/test_upsert_new_agencies.py | 59 +++ .../integration/db/structure/test_url.py | 45 +++ .../db/structure/testers/__init__.py | 0 .../db/structure/testers/models/__init__.py | 0 .../db/structure/testers/models/column.py | 10 + .../structure/testers/models/foreign_key.py | 8 + .../testers/models/unique_constraint.py | 6 + .../integration/db/structure/testers/table.py | 95 +++++ .../integration/db/structure/types.py | 10 + .../integration/db/test_change_log.py | 96 +++++ .../integration/db/test_database_structure.py | 348 ------------------ .../tasks/scheduled/sync/agency/helpers.py | 21 +- .../scheduled/sync/data_sources/conftest.py | 13 +- .../sync/data_sources/existence_checker.py | 4 +- .../scheduled/sync/data_sources/setup/core.py | 179 --------- .../scheduled/sync/data_sources/setup/data.py | 88 +---- .../sync/data_sources/setup/enums.py | 16 + .../scheduled/sync/data_sources/setup/info.py | 28 -- .../data_sources/setup/manager/__init__.py | 0 .../sync/data_sources/setup/manager/agency.py | 31 ++ .../sync/data_sources/setup/manager/core.py | 96 +++++ .../setup/manager/queries/__init__.py | 0 .../setup/manager/queries/check.py | 50 +++ .../sync/data_sources/setup/manager/url.py | 95 +++++ .../data_sources/setup/models/__init__.py | 0 .../data_sources/setup/models/url/__init__.py | 0 
.../data_sources/setup/models/url/core.py | 14 + .../setup/models/url/data_sources.py | 20 + .../data_sources/setup/models/url/post.py | 50 +++ .../setup/models/url/source_collector.py | 17 + .../sync/data_sources/test_happy_path.py | 39 +- .../tasks/url/auto_relevant/test_task.py | 8 +- .../url/duplicate/test_url_duplicate_task.py | 2 +- .../url/test_submit_approved_url_task.py | 6 +- .../tasks/url/test_url_404_probe.py | 8 +- 85 files changed, 1327 insertions(+), 788 deletions(-) create mode 100644 src/db/models/instantiations/change_log.py create mode 100644 tests/automated/integration/db/structure/README.md create mode 100644 tests/automated/integration/db/structure/__init__.py create mode 100644 tests/automated/integration/db/structure/test_batch.py create mode 100644 tests/automated/integration/db/structure/test_html_content.py create mode 100644 tests/automated/integration/db/structure/test_root_url.py create mode 100644 tests/automated/integration/db/structure/test_upsert_new_agencies.py create mode 100644 tests/automated/integration/db/structure/test_url.py create mode 100644 tests/automated/integration/db/structure/testers/__init__.py create mode 100644 tests/automated/integration/db/structure/testers/models/__init__.py create mode 100644 tests/automated/integration/db/structure/testers/models/column.py create mode 100644 tests/automated/integration/db/structure/testers/models/foreign_key.py create mode 100644 tests/automated/integration/db/structure/testers/models/unique_constraint.py create mode 100644 tests/automated/integration/db/structure/testers/table.py create mode 100644 tests/automated/integration/db/structure/types.py create mode 100644 tests/automated/integration/db/test_change_log.py delete mode 100644 tests/automated/integration/db/test_database_structure.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py delete mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/core.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/post.py create mode 100644 tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/source_collector.py diff --git a/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py b/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py index 07a51dc4..9e990bc1 100644 --- 
a/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py +++ b/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py @@ -9,6 +9,7 @@ from alembic import op import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB from src.util.alembic_helpers import switch_enum_type, id_column @@ -21,6 +22,143 @@ SYNC_STATE_TABLE_NAME = "data_sources_sync_state" URL_DATA_SOURCES_METADATA_TABLE_NAME = "url_data_sources_metadata" +CONFIRMED_AGENCY_TABLE_NAME = "confirmed_url_agency" +LINK_URLS_AGENCIES_TABLE_NAME = "link_urls_agencies" +CHANGE_LOG_TABLE_NAME = "change_log" + +AGENCIES_TABLE_NAME = "agencies" + +TABLES_TO_LOG = [ + LINK_URLS_AGENCIES_TABLE_NAME, + "urls", + "url_data_sources", + "agencies", +] + +OperationTypeEnum = sa.Enum("UPDATE", "DELETE", "INSERT", name="operation_type") + + +def upgrade() -> None: + _create_data_sources_sync_state_table() + _create_data_sources_sync_task() + + _rename_confirmed_url_agency_to_link_urls_agencies() + _create_change_log_table() + _add_jsonb_diff_val_function() + _create_log_table_changes_trigger() + + + _add_table_change_log_triggers() + _add_agency_id_column() + + + +def downgrade() -> None: + _drop_data_sources_sync_task() + _drop_data_sources_sync_state_table() + _drop_change_log_table() + _drop_table_change_log_triggers() + _drop_jsonb_diff_val_function() + _drop_log_table_changes_trigger() + + _rename_link_urls_agencies_to_confirmed_url_agency() + + OperationTypeEnum.drop(op.get_bind()) + _drop_agency_id_column() + + + +def _add_jsonb_diff_val_function() -> None: + op.execute( + """ + CREATE OR REPLACE FUNCTION jsonb_diff_val(val1 JSONB, val2 JSONB) + RETURNS JSONB AS + $$ + DECLARE + result JSONB; + v RECORD; + BEGIN + result = val1; + FOR v IN SELECT * FROM jsonb_each(val2) + LOOP + IF result @> jsonb_build_object(v.key, v.value) + THEN + result = result - v.key; + ELSIF result ? 
v.key THEN + CONTINUE; + ELSE + result = result || jsonb_build_object(v.key, 'null'); + END IF; + END LOOP; + RETURN result; + END; + $$ LANGUAGE plpgsql; + """ + ) + +def _drop_jsonb_diff_val_function() -> None: + op.execute("DROP FUNCTION IF EXISTS jsonb_diff_val(val1 JSONB, val2 JSONB)") + +def _create_log_table_changes_trigger() -> None: + op.execute( + f""" + CREATE OR REPLACE FUNCTION public.log_table_changes() + RETURNS trigger + LANGUAGE 'plpgsql' + COST 100 + VOLATILE NOT LEAKPROOF + AS $BODY$ + DECLARE + old_values JSONB; + new_values JSONB; + old_to_new JSONB; + new_to_old JSONB; + BEGIN + -- Handle DELETE operations (store entire OLD row since all data is lost) + IF (TG_OP = 'DELETE') THEN + old_values = row_to_json(OLD)::jsonb; + + INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, old_data) + VALUES ('DELETE', TG_TABLE_NAME, OLD.id, old_values); + + RETURN OLD; + + -- Handle UPDATE operations (only log the changed columns) + ELSIF (TG_OP = 'UPDATE') THEN + old_values = row_to_json(OLD)::jsonb; + new_values = row_to_json(NEW)::jsonb; + new_to_old = jsonb_diff_val(old_values, new_values); + old_to_new = jsonb_diff_val(new_values, old_values); + + -- Skip logging if both old_to_new and new_to_old are NULL or empty JSON objects + IF (new_to_old IS NOT NULL AND new_to_old <> '{{}}') OR + (old_to_new IS NOT NULL AND old_to_new <> '{{}}') THEN + INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, old_data, new_data) + VALUES ('UPDATE', TG_TABLE_NAME, OLD.id, new_to_old, old_to_new); + END IF; + + RETURN NEW; + + -- Handle INSERT operations + ELSIF (TG_OP = 'INSERT') THEN + new_values = row_to_json(NEW)::jsonb; + + -- Skip logging if new_values is NULL or an empty JSON object + IF new_values IS NOT NULL AND new_values <> '{{}}' THEN + INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, new_data) + VALUES ('INSERT', TG_TABLE_NAME, NEW.id, new_values); + END IF; + + RETURN NEW; + END IF; + END; + $BODY$; + """ + ) + +def _drop_log_table_changes_trigger() -> None: + # Drop the trigger function created in _create_log_table_changes_trigger + op.execute("DROP FUNCTION IF EXISTS public.log_table_changes()") + def _create_data_sources_sync_state_table() -> None: table = op.create_table( SYNC_STATE_TABLE_NAME, @@ -81,12 +219,67 @@ def _drop_data_sources_sync_task() -> None: ] ) +def _create_change_log_table() -> None: + # Create change_log table + op.create_table( + CHANGE_LOG_TABLE_NAME, + id_column(), + sa.Column("operation_type", OperationTypeEnum, nullable=False), + sa.Column("table_name", sa.String(), nullable=False), + sa.Column("affected_id", sa.Integer(), nullable=False), + sa.Column("old_data", JSONB, nullable=True), + sa.Column("new_data", JSONB, nullable=True), + sa.Column( + "created_at", sa.DateTime(), server_default=sa.func.now(), nullable=False + ), + ) -def upgrade() -> None: - _create_data_sources_sync_state_table() - _create_data_sources_sync_task() +def _drop_change_log_table() -> None: + op.drop_table(CHANGE_LOG_TABLE_NAME) +def _rename_confirmed_url_agency_to_link_urls_agencies() -> None: + op.rename_table(CONFIRMED_AGENCY_TABLE_NAME, LINK_URLS_AGENCIES_TABLE_NAME) -def downgrade() -> None: - _drop_data_sources_sync_task() - _drop_data_sources_sync_state_table() +def _rename_link_urls_agencies_to_confirmed_url_agency() -> None: + op.rename_table(LINK_URLS_AGENCIES_TABLE_NAME, CONFIRMED_AGENCY_TABLE_NAME) + +def _add_table_change_log_triggers() -> None: + # Create trigger for tables: + def create_table_trigger(table_name: str) -> None:
+ op.execute( + """ + CREATE OR REPLACE TRIGGER log_{table_name}_changes + BEFORE INSERT OR DELETE OR UPDATE + ON public.{table_name} + FOR EACH ROW + EXECUTE FUNCTION public.log_table_changes(); + """.format(table_name=table_name) + ) + + for table_name in TABLES_TO_LOG: + create_table_trigger(table_name) + +def _drop_table_change_log_triggers() -> None: + def drop_table_trigger(table_name: str) -> None: + op.execute( + f""" + DROP TRIGGER log_{table_name}_changes + ON public.{table_name} + """ + ) + + for table_name in TABLES_TO_LOG: + drop_table_trigger(table_name) + +def _add_agency_id_column(): + op.add_column( + AGENCIES_TABLE_NAME, + id_column(), + ) + + +def _drop_agency_id_column(): + op.drop_column( + AGENCIES_TABLE_NAME, + 'id', + ) diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index fcc103ac..d1c96769 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -9,7 +9,7 @@ from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion @@ -72,11 +72,11 @@ async def run( ) ) # Must not have confirmed agencies - .join(ConfirmedURLAgency, isouter=True) + .join(LinkURLAgency, isouter=True) .where( ~exists( - select(ConfirmedURLAgency). - where(ConfirmedURLAgency.url_id == URL.id). + select(LinkURLAgency). + where(LinkURLAgency.url_id == URL.id). 
correlate(URL) ) ) diff --git a/src/api/endpoints/review/approve/query.py b/src/api/endpoints/review/approve/query.py index c562fc43..14d465bf 100644 --- a/src/api/endpoints/review/approve/query.py +++ b/src/api/endpoints/review/approve/query.py @@ -10,7 +10,7 @@ from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL @@ -104,7 +104,7 @@ def update_if_not_none( session.add(agency) # If the new agency id is not in the existing agency ids, add it - confirmed_url_agency = ConfirmedURLAgency( + confirmed_url_agency = LinkURLAgency( url_id=self.approval_info.url_id, agency_id=new_agency_id ) diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 527ab1c4..2971dc16 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -13,7 +13,7 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion @@ -44,7 +44,7 @@ def __init__(self, batch_id: Optional[int] = None): self.double_join_relationships = [ (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), - (URL.confirmed_agencies, ConfirmedURLAgency.agency) + (URL.confirmed_agencies, LinkURLAgency.agency) ] self.count_label = "count" diff --git a/src/core/core.py b/src/core/core.py index 0b649b05..ec82e3c5 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -297,7 +297,6 @@ async def approve_url( user_id=access_info.user_id ) - async def reject_url( self, url_id: int, diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py b/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py index 8aa34c60..d896f765 100644 --- a/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py +++ b/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py @@ -1,11 +1,11 @@ from sqlalchemy import Update, update, func, text -from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState def get_mark_full_data_sources_sync_query() -> Update: return update( - AgenciesSyncState + DataSourcesSyncState ).values( last_full_sync_at=func.now(), current_cutoff_date=func.now() - text('interval \'1 day\''), diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py index d0fe2542..164f5633 100644 --- a/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py +++ 
b/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py @@ -1,9 +1,15 @@ from src.collectors.enums import URLStatus from src.db.models.instantiations.url.core.pydantic.upsert import URLUpsertModel +from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus +# upsert_urls_from_data_sources +class UpsertURLsFromDataSourcesQueryBuilder(QueryBuilderBase): + + def __init__(self): + super().__init__() def convert_data_sources_sync_response_to_url_upsert( data_sources: list[DataSourcesSyncResponseInnerInfo] @@ -13,7 +19,6 @@ def convert_data_sources_sync_response_to_url_upsert( results.append( URLUpsertModel( id=data_source.id, - url=data_source.url, name=data_source.name, description=data_source.description, outcome=_convert_to_source_collector_url_status( diff --git a/src/db/client/async_.py b/src/db/client/async_.py index febab6b3..7865a8e2 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -102,7 +102,7 @@ from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.backlog_snapshot import BacklogSnapshot from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.instantiations.link.link_task_url import LinkTaskURL @@ -180,8 +180,19 @@ async def execute(self, session: AsyncSession, statement): await session.execute(statement) @session_manager - async def add(self, session: AsyncSession, model: Base): + async def add( + self, + session: AsyncSession, + model: Base, + return_id: bool = False + ) -> int | None: session.add(model) + if return_id: + if not hasattr(model, "id"): + raise AttributeError("Models must have an id attribute") + await session.flush() + return model.id + return None @session_manager async def add_all( @@ -249,6 +260,7 @@ async def bulk_upsert( @session_manager async def scalar(self, session: AsyncSession, statement): + """Fetch the first column of the first row.""" return (await session.execute(statement)).scalar() @session_manager @@ -785,14 +797,17 @@ async def upsert_new_agencies( Add or update agencies in the database """ for suggestion in suggestions: - agency = Agency( - agency_id=suggestion.pdap_agency_id, - name=suggestion.agency_name, - state=suggestion.state, - county=suggestion.county, - locality=suggestion.locality - ) - await session.merge(agency) + query = select(Agency).where(Agency.agency_id == suggestion.pdap_agency_id) + result = await session.execute(query) + agency = result.scalars().one_or_none() + if agency is None: + agency = Agency(agency_id=suggestion.pdap_agency_id) + agency.name = suggestion.agency_name + agency.state = suggestion.state + agency.county = suggestion.county + agency.locality = suggestion.locality + session.add(agency) + @session_manager async def add_confirmed_agency_url_links( @@ -801,7 +816,7 @@ async def add_confirmed_agency_url_links( suggestions: list[URLAgencySuggestionInfo] ): for suggestion in suggestions: - confirmed_agency = ConfirmedURLAgency( + confirmed_agency = LinkURLAgency( url_id=suggestion.url_id, agency_id=suggestion.pdap_agency_id ) @@ -854,7 +869,7 @@ async def 
add_agency_manual_suggestion( @session_manager async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[URL]: - statement = select(URL).where(exists().where(ConfirmedURLAgency.url_id == URL.id)) + statement = select(URL).where(exists().where(LinkURLAgency.url_id == URL.id)) results = await session.execute(statement) return list(results.scalars().all()) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 827d0452..558a8f18 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -119,7 +119,7 @@ def insert_url(self, session, url_info: URLInfo) -> int: url_entry = URL( url=url_info.url, collector_metadata=url_info.collector_metadata, - outcome=url_info.outcome.value, + outcome=url_info.outcome, name=url_info.name ) if url_info.created_at is not None: diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 40aa8fa1..d640a851 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -9,7 +9,7 @@ from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo from src.db.dtos.url.with_html import URLWithHTML -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion @@ -128,7 +128,7 @@ def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( @staticmethod def confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies: list[ConfirmedURLAgency] + confirmed_agencies: list[LinkURLAgency] ) -> list[GetNextURLForAgencyAgencyInfo]: results = [] for confirmed_agency in confirmed_agencies: @@ -148,7 +148,7 @@ def confirmed_agencies_to_final_review_annotation_agency_info( @staticmethod def final_review_annotation_agency_info( automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], - confirmed_agencies: list[ConfirmedURLAgency], + confirmed_agencies: list[LinkURLAgency], user_agency_suggestion: UserUrlAgencySuggestion ): diff --git a/src/db/enums.py b/src/db/enums.py index 03834e9e..25701485 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -44,6 +44,11 @@ class TaskType(PyEnum): SYNC_AGENCIES = "Sync Agencies" SYNC_DATA_SOURCES = "Sync Data Sources" +class ChangeLogOperationType(PyEnum): + INSERT = "INSERT" + UPDATE = "UPDATE" + DELETE = "DELETE" + class PGEnum(TypeDecorator): impl = postgresql.ENUM @@ -52,3 +57,4 @@ def process_bind_param(self, value: PyEnum, dialect): if isinstance(value, PyEnum): return value.value return value + diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index 62dff0bd..6295415d 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -1,5 +1,5 @@ -from sqlalchemy import Column, TIMESTAMP, func, Integer, ForeignKey - +from sqlalchemy import Column, TIMESTAMP, func, Integer, ForeignKey, Enum as SAEnum +from enum import Enum as PyEnum def get_created_at_column(): return Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) @@ -15,4 +15,19 @@ def get_agency_id_foreign_column( nullable=nullable ) +def enum_column( + enum_type: type[PyEnum], + name: str, + nullable: bool = False +) -> Column[SAEnum]: + return Column( 
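+        # Native Postgres enum; values_callable persists each member's .value
+        # (e.g. "pending") in the database rather than the member name.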
+ SAEnum( + enum_type, + name=name, + native_enum=True, + values_callable=lambda enum_type: [e.value for e in enum_type] + ), + nullable=nullable + ) + CURRENT_TIME_SERVER_DEFAULT = func.now() diff --git a/src/db/models/instantiations/agency/sqlalchemy.py b/src/db/models/instantiations/agency/sqlalchemy.py index 37beec3d..2ce3676f 100644 --- a/src/db/models/instantiations/agency/sqlalchemy.py +++ b/src/db/models/instantiations/agency/sqlalchemy.py @@ -6,16 +6,18 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import Base +from src.db.models.templates import Base, StandardBase class Agency( CreatedAtMixin, # When agency was added to database UpdatedAtMixin, # When agency was last updated in database - Base + StandardBase ): __tablename__ = "agencies" + # TODO: Rename agency_id to ds_agency_id + agency_id = Column(Integer, primary_key=True) name = Column(String, nullable=False) state = Column(String, nullable=True) @@ -30,4 +32,4 @@ class Agency( # Relationships automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") - confirmed_urls = relationship("ConfirmedURLAgency", back_populates="agency") + confirmed_urls = relationship("LinkURLAgency", back_populates="agency") diff --git a/src/db/models/instantiations/backlog_snapshot.py b/src/db/models/instantiations/backlog_snapshot.py index 240a82fd..89645160 100644 --- a/src/db/models/instantiations/backlog_snapshot.py +++ b/src/db/models/instantiations/backlog_snapshot.py @@ -1,10 +1,10 @@ from sqlalchemy import Column, Integer from src.db.models.mixins import CreatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class BacklogSnapshot(CreatedAtMixin, StandardModel): +class BacklogSnapshot(CreatedAtMixin, StandardBase): __tablename__ = "backlog_snapshot" count_pending_total = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/batch/sqlalchemy.py b/src/db/models/instantiations/batch/sqlalchemy.py index 89645f4a..c1bf14fb 100644 --- a/src/db/models/instantiations/batch/sqlalchemy.py +++ b/src/db/models/instantiations/batch/sqlalchemy.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.helpers import CURRENT_TIME_SERVER_DEFAULT -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import batch_status_enum -class Batch(StandardModel): +class Batch(StandardBase): __tablename__ = 'batches' strategy = Column( diff --git a/src/db/models/instantiations/change_log.py b/src/db/models/instantiations/change_log.py new file mode 100644 index 00000000..975958ab --- /dev/null +++ b/src/db/models/instantiations/change_log.py @@ -0,0 +1,19 @@ + +from sqlalchemy import Column, Enum +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import Mapped + +from src.db.enums import ChangeLogOperationType +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates import StandardBase + + +class ChangeLog(CreatedAtMixin, StandardBase): + + __tablename__ = "change_log" + + operation_type = Column(Enum(ChangeLogOperationType, name="operation_type")) + table_name: Mapped[str] + affected_id: Mapped[int] + old_data = Column("old_data", JSONB, nullable=True) + new_data = Column("new_data", JSONB, nullable=True) diff --git 
a/src/db/models/instantiations/confirmed_url_agency.py b/src/db/models/instantiations/confirmed_url_agency.py index b8a50a21..4bda5eaa 100644 --- a/src/db/models/instantiations/confirmed_url_agency.py +++ b/src/db/models/instantiations/confirmed_url_agency.py @@ -3,11 +3,11 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class ConfirmedURLAgency(URLDependentMixin, StandardModel): - __tablename__ = "confirmed_url_agency" +class LinkURLAgency(URLDependentMixin, StandardBase): + __tablename__ = "link_urls_agencies" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/instantiations/duplicate/sqlalchemy.py b/src/db/models/instantiations/duplicate/sqlalchemy.py index 7a80d918..67df3af5 100644 --- a/src/db/models/instantiations/duplicate/sqlalchemy.py +++ b/src/db/models/instantiations/duplicate/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class Duplicate(BatchDependentMixin, StandardModel): +class Duplicate(BatchDependentMixin, StandardBase): """ Identifies duplicates which occur within a batch """ diff --git a/src/db/models/instantiations/link/link_batch_urls.py b/src/db/models/instantiations/link/link_batch_urls.py index f357ae6a..f40edc29 100644 --- a/src/db/models/instantiations/link/link_batch_urls.py +++ b/src/db/models/instantiations/link/link_batch_urls.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin, BatchDependentMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase class LinkBatchURL( @@ -9,7 +9,7 @@ class LinkBatchURL( CreatedAtMixin, URLDependentMixin, BatchDependentMixin, - StandardModel + StandardBase ): __tablename__ = "link_batch_urls" diff --git a/src/db/models/instantiations/log/sqlalchemy.py b/src/db/models/instantiations/log/sqlalchemy.py index 756e10c5..769391cf 100644 --- a/src/db/models/instantiations/log/sqlalchemy.py +++ b/src/db/models/instantiations/log/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, BatchDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class Log(CreatedAtMixin, BatchDependentMixin, StandardModel): +class Log(CreatedAtMixin, BatchDependentMixin, StandardBase): __tablename__ = 'logs' log = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/missing.py b/src/db/models/instantiations/missing.py index 0babd91d..05665eba 100644 --- a/src/db/models/instantiations/missing.py +++ b/src/db/models/instantiations/missing.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class Missing(BatchDependentMixin, StandardModel): +class Missing(BatchDependentMixin, StandardBase): __tablename__ = 'missing' place_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/root_url_cache.py b/src/db/models/instantiations/root_url_cache.py index d121ae28..4ebadd50 100644 --- 
a/src/db/models/instantiations/root_url_cache.py +++ b/src/db/models/instantiations/root_url_cache.py @@ -1,10 +1,10 @@ from sqlalchemy import UniqueConstraint, Column, String from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class RootURL(UpdatedAtMixin, StandardModel): +class RootURL(UpdatedAtMixin, StandardBase): __tablename__ = 'root_url_cache' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/task/core.py b/src/db/models/instantiations/task/core.py index 89c80405..514301c8 100644 --- a/src/db/models/instantiations/task/core.py +++ b/src/db/models/instantiations/task/core.py @@ -3,11 +3,11 @@ from src.db.enums import PGEnum, TaskType from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import batch_status_enum -class Task(UpdatedAtMixin, StandardModel): +class Task(UpdatedAtMixin, StandardBase): __tablename__ = 'tasks' task_type = Column( diff --git a/src/db/models/instantiations/task/error.py b/src/db/models/instantiations/task/error.py index cf1ae24f..03014904 100644 --- a/src/db/models/instantiations/task/error.py +++ b/src/db/models/instantiations/task/error.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, StandardModel): +class TaskError(UpdatedAtMixin, TaskDependentMixin, StandardBase): __tablename__ = 'task_errors' error = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/url/checked_for_duplicate.py b/src/db/models/instantiations/url/checked_for_duplicate.py index d5811c6e..9443d0ac 100644 --- a/src/db/models/instantiations/url/checked_for_duplicate.py +++ b/src/db/models/instantiations/url/checked_for_duplicate.py @@ -1,10 +1,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, StandardModel): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = 'url_checked_for_duplicate' # Relationships diff --git a/src/db/models/instantiations/url/compressed_html.py b/src/db/models/instantiations/url/compressed_html.py index 5c2e06c0..206348ac 100644 --- a/src/db/models/instantiations/url/compressed_html.py +++ b/src/db/models/instantiations/url/compressed_html.py @@ -2,13 +2,13 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - StandardModel + StandardBase ): __tablename__ = 'url_compressed_html' diff --git a/src/db/models/instantiations/url/core/pydantic/upsert.py b/src/db/models/instantiations/url/core/pydantic/upsert.py index 368befbd..3492b271 100644 --- a/src/db/models/instantiations/url/core/pydantic/upsert.py +++ b/src/db/models/instantiations/url/core/pydantic/upsert.py @@ -16,7 +16,6 @@ def sa_model(self) -> type[Base]: return URL id: int - url: str name: str description: str collector_metadata: dict | 
None = None diff --git a/src/db/models/instantiations/url/core/sqlalchemy.py b/src/db/models/instantiations/url/core/sqlalchemy.py index 8e9860fc..c20343b6 100644 --- a/src/db/models/instantiations/url/core/sqlalchemy.py +++ b/src/db/models/instantiations/url/core/sqlalchemy.py @@ -1,13 +1,16 @@ -from sqlalchemy import Column, Integer, ForeignKey, Text, String, JSON +from sqlalchemy import Column, Integer, ForeignKey, Text, String, JSON, Enum from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.helpers import enum_column from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import record_type_values -class URL(UpdatedAtMixin, CreatedAtMixin, StandardModel): +class URL(UpdatedAtMixin, CreatedAtMixin, StandardBase): __tablename__ = 'urls' # The batch this URL is associated with @@ -17,21 +20,16 @@ class URL(UpdatedAtMixin, CreatedAtMixin, StandardModel): # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. - outcome = Column( - postgresql.ENUM( - 'pending', - 'submitted', - 'validated', - 'not relevant', - 'duplicate', - 'error', - '404 not found', - 'individual record', - name='url_status' - ), - nullable=False + outcome = enum_column( + URLStatus, + name='url_status', + nullable=False + ) + record_type = enum_column( + RecordType, + name='record_type', + nullable=True ) - record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=True) # Relationships batch = relationship( @@ -65,7 +63,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, StandardModel): optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( - "ConfirmedURLAgency", + "LinkURLAgency", ) data_source = relationship( "URLDataSource", diff --git a/src/db/models/instantiations/url/data_source.py b/src/db/models/instantiations/url/data_source.py index ad6caf46..b5bdb40d 100644 --- a/src/db/models/instantiations/url/data_source.py +++ b/src/db/models/instantiations/url/data_source.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLDataSource(CreatedAtMixin, URLDependentMixin, StandardModel): +class URLDataSource(CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = "url_data_sources" data_source_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/error_info/sqlalchemy.py b/src/db/models/instantiations/url/error_info/sqlalchemy.py index d2a09b6a..8825777f 100644 --- a/src/db/models/instantiations/url/error_info/sqlalchemy.py +++ b/src/db/models/instantiations/url/error_info/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, StandardModel): +class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, StandardBase): __tablename__ = 
'url_error_info' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/html_content.py b/src/db/models/instantiations/url/html_content.py index 39ad3666..b23af35c 100644 --- a/src/db/models/instantiations/url/html_content.py +++ b/src/db/models/instantiations/url/html_content.py @@ -3,10 +3,10 @@ from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLHTMLContent(UpdatedAtMixin, URLDependentMixin, StandardModel): +class URLHTMLContent(UpdatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = 'url_html_content' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/optional_data_source_metadata.py b/src/db/models/instantiations/url/optional_data_source_metadata.py index 84871982..fac99828 100644 --- a/src/db/models/instantiations/url/optional_data_source_metadata.py +++ b/src/db/models/instantiations/url/optional_data_source_metadata.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLOptionalDataSourceMetadata(URLDependentMixin, StandardModel): +class URLOptionalDataSourceMetadata(URLDependentMixin, StandardBase): __tablename__ = 'url_optional_data_source_metadata' record_formats = Column(ARRAY(String), nullable=True) diff --git a/src/db/models/instantiations/url/probed_for_404.py b/src/db/models/instantiations/url/probed_for_404.py index 3913e37e..b795b628 100644 --- a/src/db/models/instantiations/url/probed_for_404.py +++ b/src/db/models/instantiations/url/probed_for_404.py @@ -2,10 +2,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLProbedFor404(URLDependentMixin, StandardModel): +class URLProbedFor404(URLDependentMixin, StandardBase): __tablename__ = 'url_probed_for_404' last_probed_at = get_created_at_column() diff --git a/src/db/models/instantiations/url/reviewing_user.py b/src/db/models/instantiations/url/reviewing_user.py index d28a33e7..938f86ab 100644 --- a/src/db/models/instantiations/url/reviewing_user.py +++ b/src/db/models/instantiations/url/reviewing_user.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, StandardModel): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = 'reviewing_user_url' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/url/suggestion/agency/auto.py b/src/db/models/instantiations/url/suggestion/agency/auto.py index 5831882f..01585535 100644 --- a/src/db/models/instantiations/url/suggestion/agency/auto.py +++ b/src/db/models/instantiations/url/suggestion/agency/auto.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardModel): +class 
AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): __tablename__ = "automated_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/agency/user.py b/src/db/models/instantiations/url/suggestion/agency/user.py index cb92bfc0..5a54399f 100644 --- a/src/db/models/instantiations/url/suggestion/agency/user.py +++ b/src/db/models/instantiations/url/suggestion/agency/user.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class UserUrlAgencySuggestion(URLDependentMixin, StandardModel): +class UserUrlAgencySuggestion(URLDependentMixin, StandardBase): __tablename__ = "user_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/record_type/auto.py b/src/db/models/instantiations/url/suggestion/record_type/auto.py index 00d738b8..34faf6f3 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/auto.py +++ b/src/db/models/instantiations/url/suggestion/record_type/auto.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import record_type_values @@ -11,7 +11,7 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardModel + StandardBase ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/record_type/user.py b/src/db/models/instantiations/url/suggestion/record_type/user.py index cda6fb17..77954509 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/user.py +++ b/src/db/models/instantiations/url/suggestion/record_type/user.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardModel): +class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = "user_record_type_suggestions" user_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py index db7f8ea2..982b4449 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py +++ b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardModel): +class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = "auto_relevant_suggestions" relevant = 
Column(Boolean, nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/relevant/user.py b/src/db/models/instantiations/url/suggestion/relevant/user.py index 35d30c44..b087f71e 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/user.py +++ b/src/db/models/instantiations/url/suggestion/relevant/user.py @@ -3,14 +3,14 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase class UserRelevantSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardModel + StandardBase ): __tablename__ = "user_relevant_suggestions" diff --git a/src/db/models/templates.py b/src/db/models/templates.py index 3e0a1c95..5e738fab 100644 --- a/src/db/models/templates.py +++ b/src/db/models/templates.py @@ -4,7 +4,7 @@ # Base class for SQLAlchemy ORM models Base = declarative_base() -class StandardModel(Base): +class StandardBase(Base): __abstract__ = True id = Column(Integer, primary_key=True, autoincrement=True) diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index fbdc9511..91f4926f 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -7,7 +7,7 @@ from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL from src.db.models.instantiations.link.link_task_url import LinkTaskURL from src.db.models.instantiations.task.core import Task @@ -81,7 +81,7 @@ def exclude_urls_with_agency_suggestions( ) # Exclude if confirmed agencies exist statement = statement.where( - ~exists().where(ConfirmedURLAgency.url_id == URL.id) + ~exists().where(LinkURLAgency.url_id == URL.id) ) return statement diff --git a/tests/automated/integration/api/review/rejection/helpers.py b/tests/automated/integration/api/review/rejection/helpers.py index 1e825694..2162a7b8 100644 --- a/tests/automated/integration/api/review/rejection/helpers.py +++ b/tests/automated/integration/api/review/rejection/helpers.py @@ -36,4 +36,4 @@ async def run_rejection_test( assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.outcome == url_status.value + assert url.outcome == url_status diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index 9b51311a..f706a6ee 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -6,7 +6,7 @@ from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @@ -54,8 +54,8 @@ async def 
test_approve_and_get_next_source_for_review(api_test_helper): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.record_type == RecordType.ARREST_RECORDS.value - assert url.outcome == URLStatus.VALIDATED.value + assert url.record_type == RecordType.ARREST_RECORDS + assert url.outcome == URLStatus.VALIDATED assert url.name == "New Test Name" assert url.description == "New Test Description" @@ -66,7 +66,7 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): assert optional_metadata[0].record_formats == ["New Test Record Format", "New Test Record Format 2"] # Get agencies - confirmed_agencies = await adb_client.get_all(ConfirmedURLAgency) + confirmed_agencies = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agencies) == 4 for agency in confirmed_agencies: assert agency.agency_id in agency_ids diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 90b52db4..59568266 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,7 +3,7 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL @@ -41,12 +41,12 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.record_type == RecordType.ARREST_RECORDS.value - assert url.outcome == URLStatus.VALIDATED.value + assert url.record_type == RecordType.ARREST_RECORDS + assert url.outcome == URLStatus.VALIDATED assert url.name == "Test Name" assert url.description == "Test Description" - confirmed_agency: list[ConfirmedURLAgency] = await adb_client.get_all(ConfirmedURLAgency) + confirmed_agency: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agency) == 1 assert confirmed_agency[0].url_id == url_mapping.url_id assert confirmed_agency[0].agency_id == agency_id diff --git a/tests/automated/integration/db/structure/README.md b/tests/automated/integration/db/structure/README.md new file mode 100644 index 00000000..2e22a324 --- /dev/null +++ b/tests/automated/integration/db/structure/README.md @@ -0,0 +1,6 @@ +Database Structure tests, in this instance +Test the integrity of the database schema and that it behaves as expected. 
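+
+Each test builds a `TableTester` from `ColumnTester` entries (see `testers/`): allowed sample
+values are inserted, and generated invalid values are expected to be rejected. A minimal,
+illustrative test (the table and column names here are hypothetical) might look like:
+
+```python
+import sqlalchemy as sa
+
+from tests.automated.integration.db.structure.testers.models.column import ColumnTester
+from tests.automated.integration.db.structure.testers.table import TableTester
+
+
+def test_example_table(db_data_creator):
+    # Hypothetical table with two columns; real tests target tables such as "batches" or "urls".
+    TableTester(
+        table_name="example_table",
+        columns=[
+            ColumnTester(column_name="name", type_=sa.String, allowed_values=["a", "b"]),
+            ColumnTester(column_name="count", type_=sa.Integer, allowed_values=[1, 2]),
+        ],
+        engine=db_data_creator.db_client.engine,
+    ).run_column_tests()
+```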
+ +This includes testing that: +* Enum columns allow only allowed values (and throw errors on others) +* Column types are correct diff --git a/tests/automated/integration/db/structure/__init__.py b/tests/automated/integration/db/structure/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/test_batch.py b/tests/automated/integration/db/structure/test_batch.py new file mode 100644 index 00000000..7f7bfcf3 --- /dev/null +++ b/tests/automated/integration/db/structure/test_batch.py @@ -0,0 +1,88 @@ +import sqlalchemy as sa +from sqlalchemy import create_engine +from sqlalchemy.dialects import postgresql + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.helpers import get_postgres_connection_string +from src.util.helper_functions import get_enum_values +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester + + +def test_batch(wiped_database): + engine = create_engine(get_postgres_connection_string()) + table_tester = TableTester( + table_name="batches", + columns=[ + ColumnTester( + column_name="strategy", + type_=postgresql.ENUM, + allowed_values=get_enum_values(CollectorType), + ), + ColumnTester( + column_name="user_id", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="status", + type_=postgresql.ENUM, + allowed_values=get_enum_values(BatchStatus), + ), + ColumnTester( + column_name="total_url_count", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="original_url_count", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="duplicate_url_count", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="strategy_success_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="metadata_success_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="agency_match_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="record_type_match_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="record_category_match_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="compute_time", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="parameters", + type_=sa.JSON, + allowed_values=[{}] + ) + + ], + engine=engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/test_html_content.py b/tests/automated/integration/db/structure/test_html_content.py new file mode 100644 index 00000000..8c9c3207 --- /dev/null +++ b/tests/automated/integration/db/structure/test_html_content.py @@ -0,0 +1,38 @@ +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.enums import URLHTMLContentType +from src.util.helper_functions import get_enum_values +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester +from tests.helpers.db_data_creator import DBDataCreator + + +def test_html_content(db_data_creator: DBDataCreator): + batch_id = db_data_creator.batch() + iui: InsertURLsInfo = db_data_creator.urls(batch_id=batch_id, url_count=1) + + table_tester = TableTester( + table_name="url_html_content", + 
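+        # url_id references the URL created above via db_data_creator;
+        # content_type is exercised against every URLHTMLContentType value.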
columns=[ + ColumnTester( + column_name="url_id", + type_=sa.Integer, + allowed_values=[iui.url_mappings[0].url_id] + ), + ColumnTester( + column_name="content_type", + type_=postgresql.ENUM, + allowed_values=get_enum_values(URLHTMLContentType) + ), + ColumnTester( + column_name="content", + type_=sa.Text, + allowed_values=["Text"] + ) + ], + engine=db_data_creator.db_client.engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/test_root_url.py b/tests/automated/integration/db/structure/test_root_url.py new file mode 100644 index 00000000..7c3712df --- /dev/null +++ b/tests/automated/integration/db/structure/test_root_url.py @@ -0,0 +1,32 @@ +import sqlalchemy as sa + +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester +from tests.helpers.db_data_creator import DBDataCreator + + +def test_root_url(db_data_creator: DBDataCreator): + + table_tester = TableTester( + table_name="root_urls", + columns=[ + ColumnTester( + column_name="url", + type_=sa.String, + allowed_values=["https://example.com"] + ), + ColumnTester( + column_name="page_title", + type_=sa.String, + allowed_values=["Text"] + ), + ColumnTester( + column_name="page_description", + type_=sa.String, + allowed_values=["Text"] + ) + ], + engine=db_data_creator.db_client.engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/test_upsert_new_agencies.py b/tests/automated/integration/db/structure/test_upsert_new_agencies.py new file mode 100644 index 00000000..17a184f4 --- /dev/null +++ b/tests/automated/integration/db/structure/test_upsert_new_agencies.py @@ -0,0 +1,59 @@ +import pytest + +from src.core.enums import SuggestionType +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.db.models.instantiations.agency.sqlalchemy import Agency +from tests.helpers.db_data_creator import DBDataCreator + + +@pytest.mark.asyncio +async def test_upsert_new_agencies( + wiped_database, + db_data_creator: DBDataCreator +): + """ + Check that if the agency doesn't exist, it is added + But if the agency does exist, it is updated with new information + """ + + suggestions = [] + for i in range(3): + suggestion = URLAgencySuggestionInfo( + url_id=1, + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=i, + agency_name=f"Test Agency {i}", + state=f"Test State {i}", + county=f"Test County {i}", + locality=f"Test Locality {i}", + user_id=1 + ) + suggestions.append(suggestion) + + adb_client = db_data_creator.adb_client + await adb_client.upsert_new_agencies(suggestions) + + update_suggestion = URLAgencySuggestionInfo( + url_id=1, + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=0, + agency_name="Updated Test Agency", + state="Updated Test State", + county="Updated Test County", + locality="Updated Test Locality", + user_id=1 + ) + + await adb_client.upsert_new_agencies([update_suggestion]) + + rows = await adb_client.get_all(Agency, order_by_attribute="agency_id") + + assert len(rows) == 3 + + d = {} + for row in rows: + d[row.agency_id] = row.name + + assert d[0] == "Updated Test Agency" + assert d[1] == "Test Agency 1" + assert d[2] == "Test Agency 2" diff --git a/tests/automated/integration/db/structure/test_url.py b/tests/automated/integration/db/structure/test_url.py new file mode 100644 index 00000000..c9c3cf79 --- /dev/null +++ 
b/tests/automated/integration/db/structure/test_url.py @@ -0,0 +1,45 @@ +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from src.collectors.enums import URLStatus +from src.util.helper_functions import get_enum_values +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester +from tests.helpers.db_data_creator import DBDataCreator + + +def test_url(db_data_creator: DBDataCreator): + batch_id = db_data_creator.batch() + table_tester = TableTester( + table_name="urls", + columns=[ + ColumnTester( + column_name="batch_id", + type_=sa.Integer, + allowed_values=[batch_id], + ), + ColumnTester( + column_name="url", + type_=sa.String, + allowed_values=["https://example.com"], + ), + ColumnTester( + column_name="collector_metadata", + type_=sa.JSON, + allowed_values=[{}] + ), + ColumnTester( + column_name="outcome", + type_=postgresql.ENUM, + allowed_values=get_enum_values(URLStatus) + ), + ColumnTester( + column_name="name", + type_=sa.String, + allowed_values=['test'], + ) + ], + engine=db_data_creator.db_client.engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/testers/__init__.py b/tests/automated/integration/db/structure/testers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/testers/models/__init__.py b/tests/automated/integration/db/structure/testers/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/testers/models/column.py b/tests/automated/integration/db/structure/testers/models/column.py new file mode 100644 index 00000000..1b4c5a50 --- /dev/null +++ b/tests/automated/integration/db/structure/testers/models/column.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + +from tests.automated.integration.db.structure.types import SATypes + + +@dataclass +class ColumnTester: + column_name: str + type_: SATypes + allowed_values: list diff --git a/tests/automated/integration/db/structure/testers/models/foreign_key.py b/tests/automated/integration/db/structure/testers/models/foreign_key.py new file mode 100644 index 00000000..517a82a8 --- /dev/null +++ b/tests/automated/integration/db/structure/testers/models/foreign_key.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + + +@dataclass +class ForeignKeyTester: + column_name: str + valid_id: int + invalid_id: int diff --git a/tests/automated/integration/db/structure/testers/models/unique_constraint.py b/tests/automated/integration/db/structure/testers/models/unique_constraint.py new file mode 100644 index 00000000..baa85cbb --- /dev/null +++ b/tests/automated/integration/db/structure/testers/models/unique_constraint.py @@ -0,0 +1,6 @@ +from dataclasses import dataclass + + +@dataclass +class UniqueConstraintTester: + columns: list[str] diff --git a/tests/automated/integration/db/structure/testers/table.py b/tests/automated/integration/db/structure/testers/table.py new file mode 100644 index 00000000..ca594eb4 --- /dev/null +++ b/tests/automated/integration/db/structure/testers/table.py @@ -0,0 +1,95 @@ +from typing import Optional, Any + +import pytest +import sqlalchemy as sa +from sqlalchemy import create_engine +from sqlalchemy.dialects import postgresql +from sqlalchemy.exc import DataError + +from src.db.helpers import get_postgres_connection_string +from src.db.models.templates import Base +from 
tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.types import ConstraintTester, SATypes + + +class TableTester: + + def __init__( + self, + columns: list[ColumnTester], + table_name: str, + engine: Optional[sa.Engine] = None, + constraints: Optional[list[ConstraintTester]] = None, + ): + if engine is None: + engine = create_engine(get_postgres_connection_string(is_async=True)) + self.columns = columns + self.table_name = table_name + self.constraints = constraints + self.engine = engine + + def run_tests(self): + pass + + def setup_row_dict(self, override: Optional[dict[str, Any]] = None): + d = {} + for column in self.columns: + # For row dicts, the first value is the default + d[column.column_name] = column.allowed_values[0] + if override is not None: + d.update(override) + return d + + def run_column_test(self, column: ColumnTester): + if len(column.allowed_values) == 1: + return # It will be tested elsewhere + for value in column.allowed_values: + print(f"Testing column {column.column_name} with value {value}") + row_dict = self.setup_row_dict(override={column.column_name: value}) + table = self.get_table_model() + with self.engine.begin() as conn: + # Delete existing rows + conn.execute(table.delete()) + conn.commit() + with self.engine.begin() as conn: + conn.execute(table.insert(), row_dict) + conn.commit() + conn.close() + self.test_invalid_values(column) + + def generate_invalid_value(self, type_: SATypes): + match type_: + case sa.Integer: + return "not an integer" + case sa.String: + return -1 + case postgresql.ENUM: + return "not an enum value" + case sa.TIMESTAMP: + return "not a timestamp" + + def test_invalid_values(self, column: ColumnTester): + invalid_value = self.generate_invalid_value(type_=column.type_) + row_dict = self.setup_row_dict(override={column.column_name: invalid_value}) + table = self.get_table_model() + print(f"Testing column '{column.column_name}' with invalid value {invalid_value}") + with pytest.raises(DataError): + with self.engine.begin() as conn: + conn.execute(table.delete()) + conn.commit() + with self.engine.begin() as conn: + conn.execute(table.insert(), row_dict) + conn.commit() + conn.close() + + + def get_table_model(self) -> sa.Table: + """ + Retrieve table model from metadata + """ + return sa.Table(self.table_name, Base.metadata, autoload_with=self.engine) + + + def run_column_tests(self): + for column in self.columns: + self.run_column_test(column) diff --git a/tests/automated/integration/db/structure/types.py b/tests/automated/integration/db/structure/types.py new file mode 100644 index 00000000..3124538f --- /dev/null +++ b/tests/automated/integration/db/structure/types.py @@ -0,0 +1,10 @@ +from typing import TypeAlias + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from tests.automated.integration.db.structure.testers.models.foreign_key import ForeignKeyTester +from tests.automated.integration.db.structure.testers.models.unique_constraint import UniqueConstraintTester + +SATypes: TypeAlias = type[sa.Integer] | type[sa.String] | type[postgresql.ENUM] | type[sa.TIMESTAMP] | type[sa.Text] +ConstraintTester: TypeAlias = UniqueConstraintTester | ForeignKeyTester diff --git a/tests/automated/integration/db/test_change_log.py b/tests/automated/integration/db/test_change_log.py new file mode 100644 index 00000000..dde2d702 --- /dev/null +++ b/tests/automated/integration/db/test_change_log.py @@ -0,0 +1,96 @@ +import pytest +from sqlalchemy import update, delete + 
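+# End-to-end check of the change_log trigger chain: an INSERT, UPDATE and DELETE on the
+# urls table should each produce one change_log row with the expected old/new data.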
+from src.db.client.async_ import AsyncDatabaseClient +from src.db.enums import ChangeLogOperationType +from src.db.models.instantiations.change_log import ChangeLog +from src.db.models.instantiations.url.core.sqlalchemy import URL + + +class _TestChangeGetter: + + def __init__(self, adb: AsyncDatabaseClient): + self.adb = adb + + async def get_change_log_entries(self): + return await self.adb.get_all(ChangeLog) + +@pytest.mark.asyncio +async def test_change_log(wiped_database, adb_client_test: AsyncDatabaseClient): + getter = _TestChangeGetter(adb_client_test) + + # Confirm no entries in the change log table + entries = await getter.get_change_log_entries() + assert len(entries) == 0 + + # Add entry to URL table + url = URL( + url="test_url", + name="test_name", + description="test_description", + outcome='pending' + ) + url_id = await adb_client_test.add(url, return_id=True) + + # Choose a single logged table -- URL -- for testing + entries = await getter.get_change_log_entries() + assert len(entries) == 1 + entry: ChangeLog = entries[0] + assert entry.operation_type == ChangeLogOperationType.INSERT + assert entry.table_name == "urls" + assert entry.affected_id == url_id + assert entry.old_data is None + assert entry.new_data is not None + nd = entry.new_data + assert nd["id"] == url_id + assert nd["url"] == "test_url" + assert nd["name"] == "test_name" + assert nd["description"] == "test_description" + assert nd["outcome"] == "pending" + assert nd["created_at"] is not None + assert nd["updated_at"] is not None + assert nd['record_type'] is None + assert nd['collector_metadata'] is None + + # Update URL + + await adb_client_test.execute( + update(URL).where(URL.id == url_id).values( + name="new_name", + description="new_description" + ) + ) + + # Confirm change log entry + entries = await getter.get_change_log_entries() + assert len(entries) == 2 + entry: ChangeLog = entries[1] + assert entry.operation_type == ChangeLogOperationType.UPDATE + assert entry.table_name == "urls" + assert entry.affected_id == url_id + assert entry.old_data is not None + assert entry.new_data is not None + od = entry.old_data + nd = entry.new_data + assert nd['description'] == "new_description" + assert od['description'] == "test_description" + assert nd['name'] == "new_name" + assert od['name'] == "test_name" + assert nd['updated_at'] is not None + assert od['updated_at'] is not None + + # Delete URL + await adb_client_test.execute( + delete(URL).where(URL.id == url_id) + ) + + # Confirm change log entry + entries = await getter.get_change_log_entries() + assert len(entries) == 3 + entry: ChangeLog = entries[2] + assert entry.operation_type == ChangeLogOperationType.DELETE + assert entry.table_name == "urls" + assert entry.affected_id == url_id + assert entry.old_data is not None + assert entry.new_data is None + diff --git a/tests/automated/integration/db/test_database_structure.py b/tests/automated/integration/db/test_database_structure.py deleted file mode 100644 index 4b73bd3d..00000000 --- a/tests/automated/integration/db/test_database_structure.py +++ /dev/null @@ -1,348 +0,0 @@ -""" -Database Structure tests, in this instance -Test the integrity of the database schema and that it behaves as expected. 
- -This includes testing that: -* Enum columns allow only allowed values (and throw errors on others) -* Column types are correct -""" - -from dataclasses import dataclass -from typing import TypeAlias, Optional, Any - -import pytest -import sqlalchemy as sa -from sqlalchemy import create_engine -from sqlalchemy.dialects import postgresql -from sqlalchemy.exc import DataError - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.db.enums import URLHTMLContentType -from src.db.helpers import get_postgres_connection_string -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.collectors.enums import CollectorType, URLStatus -from src.core.enums import BatchStatus, SuggestionType -from src.db.models.templates import Base -from src.util.helper_functions import get_enum_values -from tests.helpers.db_data_creator import DBDataCreator - -SATypes: TypeAlias = sa.Integer or sa.String or postgresql.ENUM or sa.TIMESTAMP or sa.Text - -@dataclass -class ColumnTester: - column_name: str - type_: SATypes - allowed_values: list - -@dataclass -class UniqueConstraintTester: - columns: list[str] - -@dataclass -class ForeignKeyTester: - column_name: str - valid_id: int - invalid_id: int - -ConstraintTester: TypeAlias = UniqueConstraintTester or ForeignKeyTester - -class TableTester: - - def __init__( - self, - columns: list[ColumnTester], - table_name: str, - engine: Optional[sa.Engine] = None, - constraints: Optional[list[ConstraintTester]] = None, - ): - if engine is None: - engine = create_engine(get_postgres_connection_string(is_async=True)) - self.columns = columns - self.table_name = table_name - self.constraints = constraints - self.engine = engine - - def run_tests(self): - pass - - def setup_row_dict(self, override: Optional[dict[str, Any]] = None): - d = {} - for column in self.columns: - # For row dicts, the first value is the default - d[column.column_name] = column.allowed_values[0] - if override is not None: - d.update(override) - return d - - def run_column_test(self, column: ColumnTester): - if len(column.allowed_values) == 1: - return # It will be tested elsewhere - for value in column.allowed_values: - print(f"Testing column {column.column_name} with value {value}") - row_dict = self.setup_row_dict(override={column.column_name: value}) - table = self.get_table_model() - with self.engine.begin() as conn: - # Delete existing rows - conn.execute(table.delete()) - conn.commit() - with self.engine.begin() as conn: - conn.execute(table.insert(), row_dict) - conn.commit() - conn.close() - self.test_invalid_values(column) - - def generate_invalid_value(self, type_: SATypes): - match type_: - case sa.Integer: - return "not an integer" - case sa.String: - return -1 - case postgresql.ENUM: - return "not an enum value" - case sa.TIMESTAMP: - return "not a timestamp" - - def test_invalid_values(self, column: ColumnTester): - invalid_value = self.generate_invalid_value(type_=column.type_) - row_dict = self.setup_row_dict(override={column.column_name: invalid_value}) - table = self.get_table_model() - print(f"Testing column '{column.column_name}' with invalid value {invalid_value}") - with pytest.raises(DataError): - with self.engine.begin() as conn: - conn.execute(table.delete()) - conn.commit() - with self.engine.begin() as conn: - conn.execute(table.insert(), row_dict) - conn.commit() - conn.close() - - - def get_table_model(self) -> sa.Table: - """ - Retrieve table 
model from metadata - """ - return sa.Table(self.table_name, Base.metadata, autoload_with=self.engine) - - - def run_column_tests(self): - for column in self.columns: - self.run_column_test(column) - - -def test_batch(wiped_database): - engine = create_engine(get_postgres_connection_string()) - table_tester = TableTester( - table_name="batches", - columns=[ - ColumnTester( - column_name="strategy", - type_=postgresql.ENUM, - allowed_values=get_enum_values(CollectorType), - ), - ColumnTester( - column_name="user_id", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="status", - type_=postgresql.ENUM, - allowed_values=get_enum_values(BatchStatus), - ), - ColumnTester( - column_name="total_url_count", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="original_url_count", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="duplicate_url_count", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="strategy_success_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="metadata_success_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="agency_match_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="record_type_match_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="record_category_match_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="compute_time", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="parameters", - type_=sa.JSON, - allowed_values=[{}] - ) - - ], - engine=engine - ) - - table_tester.run_column_tests() - -def test_url(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - table_tester = TableTester( - table_name="urls", - columns=[ - ColumnTester( - column_name="batch_id", - type_=sa.Integer, - allowed_values=[batch_id], - ), - ColumnTester( - column_name="url", - type_=sa.String, - allowed_values=["https://example.com"], - ), - ColumnTester( - column_name="collector_metadata", - type_=sa.JSON, - allowed_values=[{}] - ), - ColumnTester( - column_name="outcome", - type_=postgresql.ENUM, - allowed_values=get_enum_values(URLStatus) - ), - ColumnTester( - column_name="name", - type_=sa.String, - allowed_values=['test'], - ) - ], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - -def test_html_content(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - iui: InsertURLsInfo = db_data_creator.urls(batch_id=batch_id, url_count=1) - - table_tester = TableTester( - table_name="url_html_content", - columns=[ - ColumnTester( - column_name="url_id", - type_=sa.Integer, - allowed_values=[iui.url_mappings[0].url_id] - ), - ColumnTester( - column_name="content_type", - type_=postgresql.ENUM, - allowed_values=get_enum_values(URLHTMLContentType) - ), - ColumnTester( - column_name="content", - type_=sa.Text, - allowed_values=["Text"] - ) - ], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - -def test_root_url(db_data_creator: DBDataCreator): - - table_tester = TableTester( - table_name="root_urls", - columns=[ - ColumnTester( - column_name="url", - type_=sa.String, - allowed_values=["https://example.com"] - ), - ColumnTester( - column_name="page_title", - type_=sa.String, - allowed_values=["Text"] - ), - ColumnTester( - column_name="page_description", - type_=sa.String, - allowed_values=["Text"] - ) - 
], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - - -@pytest.mark.asyncio -async def test_upsert_new_agencies(db_data_creator: DBDataCreator): - """ - Check that if the agency doesn't exist, it is added - But if the agency does exist, it is updated with new information - """ - - suggestions = [] - for i in range(3): - suggestion = URLAgencySuggestionInfo( - url_id=1, - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=i, - agency_name=f"Test Agency {i}", - state=f"Test State {i}", - county=f"Test County {i}", - locality=f"Test Locality {i}", - user_id=1 - ) - suggestions.append(suggestion) - - adb_client = db_data_creator.adb_client - await adb_client.upsert_new_agencies(suggestions) - - update_suggestion = URLAgencySuggestionInfo( - url_id=1, - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=0, - agency_name="Updated Test Agency", - state="Updated Test State", - county="Updated Test County", - locality="Updated Test Locality", - user_id=1 - ) - - await adb_client.upsert_new_agencies([update_suggestion]) - - rows = await adb_client.get_all(Agency) - - assert len(rows) == 3 - - d = {} - for row in rows: - d[row.agency_id] = row.name - - assert d[0] == "Updated Test Agency" - assert d[1] == "Test Agency 1" - assert d[2] == "Test Agency 2" diff --git a/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py index 593ec1e1..6fe988a6 100644 --- a/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py @@ -2,7 +2,7 @@ from datetime import timedelta from unittest.mock import patch -from sqlalchemy import select, func, TIMESTAMP, cast +from sqlalchemy import select, func, TIMESTAMP, cast, update from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.agency.sqlalchemy import Agency @@ -21,7 +21,7 @@ async def check_sync_concluded( ) ) - sync_state_results = await db_client.scalar( + sync_state_results = await db_client.mapping( select( AgenciesSyncState ) @@ -45,18 +45,13 @@ async def check_sync_concluded( async def update_existing_agencies_updated_at(db_data_creator): - update_mappings = [] for preexisting_agency in PREEXISTING_AGENCIES: - update_mapping = { - "agency_id": preexisting_agency.agency_id, - "updated_at": preexisting_agency.updated_at - } - update_mappings.append(update_mapping) - await db_data_creator.adb_client.bulk_update( - model=Agency, - mappings=update_mappings, - ) - + query = ( + update(Agency) + .where(Agency.agency_id == preexisting_agency.agency_id) + .values(updated_at=preexisting_agency.updated_at) + ) + await db_data_creator.adb_client.execute(query) async def add_existing_agencies(db_data_creator): agencies_to_add = [] diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py index 67019539..470504ab 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py @@ -1,11 +1,16 @@ import pytest_asyncio from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.external.pdap.client import PDAPClient +from tests.helpers.db_data_creator import DBDataCreator @pytest_asyncio.fixture -async def setup( - db_data_creator, - mock_pdap_client +async def test_operator( + 
db_data_creator: DBDataCreator, + mock_pdap_client: PDAPClient ) -> SyncDataSourcesTaskOperator: - raise NotImplementedError \ No newline at end of file + return SyncDataSourcesTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=mock_pdap_client + ) diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py index 3e4cc3c5..64e0f742 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py @@ -1,6 +1,6 @@ from collections import defaultdict -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source import URLDataSource from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo @@ -12,7 +12,7 @@ def __init__( self, responses: list[DataSourcesSyncResponseInfo], url_ds_links: list[URLDataSource], - url_agency_links: list[ConfirmedURLAgency] + url_agency_links: list[LinkURLAgency] ): self._ds_id_response_dict: dict[int, DataSourcesSyncResponseInnerInfo] = {} for response in responses: diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py index 936d935e..932d2518 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py @@ -1,186 +1,7 @@ from contextlib import contextmanager -from datetime import datetime from unittest.mock import patch -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source import URLDataSource from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import TestURLSetupEntry, \ - SyncResponseOrder, TestURLPostSetupRecord, AgencyAssigned -from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.info import TestDataSourcesSyncSetupInfo -from tests.helpers.db_data_creator import DBDataCreator - - -async def setup_data( - db_data_creator: DBDataCreator, - mock_pdap_client: PDAPClient -) -> TestDataSourcesSyncSetupInfo: - adb_client = db_data_creator.adb_client - - agency_id_preexisting_urls = await db_data_creator.agency() - agency_id_new_urls = await db_data_creator.agency() - - # Setup data sources - - - # Setup pre-existing urls - preexisting_urls = [ - URL( - url='https://example.com/1', - name='Pre-existing URL 1', - description='Pre-existing URL 1 Description', - collector_metadata={}, - outcome=URLStatus.PENDING.value, - record_type=RecordType.ACCIDENT_REPORTS.value, - updated_at=datetime(2023, 1, 1, 0, 0, 0), - ), - URL( - 
url='https://example.com/2', - name='Pre-existing URL 2', - description='Pre-existing URL 2 Description', - collector_metadata={}, - outcome=URLStatus.VALIDATED.value, - record_type=RecordType.ACCIDENT_REPORTS.value, - updated_at=datetime(2025, 10, 17, 3, 0, 0), - ), - ] - preexisting_url_ids = await adb_client.add_all(preexisting_urls, return_ids=True) - # Link second pre-existing url to data source - await adb_client.add(URLDataSource( - url_id=preexisting_url_ids[1], - data_source_id=preexisting_url_ids[1] - )) - - # Link second pre-existing url to agency - await adb_client.add(ConfirmedURLAgency( - url_id=preexisting_url_ids[1], - agency_id=agency_id_preexisting_urls - )) - - - first_call_response = DataSourcesSyncResponseInfo( - data_sources=[ - DataSourcesSyncResponseInnerInfo( - id=120, - url="https://newurl.com/1", - name="New URL 1", - description="New URL 1 Description", - approval_status=ApprovalStatus.APPROVED, - updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[agency_id_new_urls], - url_status=DataSourcesURLStatus.OK - ), - DataSourcesSyncResponseInnerInfo( - id=121, - url="https://newurl.com/2", - name="New URL 2", - description="New URL 2 Description", - approval_status=ApprovalStatus.APPROVED, - updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.FIELD_CONTACTS, - agency_ids=[agency_id_new_urls], - url_status=DataSourcesURLStatus.BROKEN - ), - DataSourcesSyncResponseInnerInfo( - id=122, - url="https://newurl.com/3", - name="New URL 3", - description="New URL 3 Description", - approval_status=ApprovalStatus.PENDING, - updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.WANTED_PERSONS, - agency_ids=[agency_id_new_urls], - url_status=DataSourcesURLStatus.OK - ), - DataSourcesSyncResponseInnerInfo( - id=123, - url="https://newurl.com/4", - name="New URL 4", - description="New URL 4 Description", - approval_status=ApprovalStatus.NEEDS_IDENTIFICATION, - updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.STOPS, - agency_ids=[agency_id_new_urls], - url_status=DataSourcesURLStatus.OK - ), - DataSourcesSyncResponseInnerInfo( - id=preexisting_url_ids[0], - url="https://newurl.com/5", - name="Updated Preexisting URL 1", - description="Updated Preexisting URL 1 Description", - approval_status=ApprovalStatus.REJECTED, # Status should update to rejected. 
- updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.BOOKING_REPORTS, - agency_ids=[agency_id_preexisting_urls, agency_id_new_urls], - url_status=DataSourcesURLStatus.OK - ) - ] - ) - second_call_response = DataSourcesSyncResponseInfo( - data_sources=[ - DataSourcesSyncResponseInnerInfo( - id=preexisting_url_ids[1], - url="https://newurl.com/6", - name="Updated Preexisting URL 2", - description="Updated Preexisting URL 2 Description", - approval_status=ApprovalStatus.APPROVED, # SC should stay validated - updated_at=datetime(2023, 1, 1, 0, 0, 0), - record_type=RecordType.PERSONNEL_RECORDS, - agency_ids=[agency_id_new_urls], - url_status=DataSourcesURLStatus.OK - ), - ] - ) - third_call_response = DataSourcesSyncResponseInfo(data_sources=[]) - - - - -class DataSourcesSyncTestSetupManager: - - def __init__( - self, - adb_client: AsyncDatabaseClient, - entries: list[TestURLSetupEntry] - ): - self.adb_client = adb_client - self.entries = entries - - self.response_dict: dict[ - SyncResponseOrder, list[DataSourcesSyncResponseInfo] - ] = { - e: [] for e in SyncResponseOrder - } - self.test_agency_dict: dict[ - AgencyAssigned, int - ] = {} - - async def setup(self): - await self.setup_agencies() - - async def setup_entries(self): - for entry in self.entries: - await self.setup_entry(entry) - - async def setup_entry( - self, - entry: TestURLSetupEntry - ) -> TestURLPostSetupRecord: - if entry.sc_info is not None: - # TODO: Add SC entry - raise NotImplementedError() - if entry.ds_info is not None: - # TODO: Add DS entry - raise NotImplementedError() @contextmanager diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py index d947e061..ddc7b9d6 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py @@ -1,55 +1,10 @@ -from enum import Enum - -from pydantic import BaseModel - from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus - -class SyncResponseOrder(Enum): - """Represents which sync response the entry is in.""" - FIRST = 1 - SECOND = 2 - # No entries should be in 3 - THIRD = 3 - -class AgencyAssigned(Enum): - """Represents which of several pre-created agencies the entry is assigned to.""" - ONE = 1 - TWO = 2 - THREE = 3 - -class TestDSURLSetupEntry(BaseModel): - """Represents URL previously existing in DS DB. - - These values should overwrite any SC values - """ - id: int # ID of URL in DS App - name: str - description: str - url_status: DataSourcesURLStatus - approval_status: ApprovalStatus - record_type: RecordType - agency_ids: list[AgencyAssigned] - sync_response_order: SyncResponseOrder - -class TestSCURLSetupEntry(BaseModel): - """Represents URL previously existing in SC DB. 
- - These values should be overridden by any DS values - """ - name: str - description: str - record_type: RecordType - url_status: URLStatus - agency_ids: list[AgencyAssigned] - -class TestURLSetupEntry(BaseModel): - url: str - ds_info: TestDSURLSetupEntry | None # Represents URL previously existing in DS DB - sc_info: TestSCURLSetupEntry | None # Represents URL previously existing in SC DB - - final_status: URLStatus +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder, AgencyAssigned +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import TestSCURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.core import TestURLSetupEntry ENTRIES = [ TestURLSetupEntry( @@ -62,7 +17,7 @@ class TestURLSetupEntry(BaseModel): url_status=DataSourcesURLStatus.OK, approval_status=ApprovalStatus.APPROVED, record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[AgencyAssigned.ONE, AgencyAssigned.TWO], + agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.TWO], sync_response_order=SyncResponseOrder.FIRST ), sc_info=TestSCURLSetupEntry( @@ -70,9 +25,9 @@ class TestURLSetupEntry(BaseModel): description='Pre-existing URL 1 Description', record_type=RecordType.ACCIDENT_REPORTS, url_status=URLStatus.PENDING, - agency_ids=[AgencyAssigned.ONE, AgencyAssigned.THREE] + agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] ), - final_status=URLStatus.VALIDATED + final_url_status=URLStatus.VALIDATED ), TestURLSetupEntry( # A DS-only approved but broken URL @@ -84,11 +39,11 @@ class TestURLSetupEntry(BaseModel): url_status=DataSourcesURLStatus.BROKEN, approval_status=ApprovalStatus.APPROVED, record_type=RecordType.INCARCERATION_RECORDS, - agency_ids=[AgencyAssigned.TWO], + agencies_assigned=[AgencyAssigned.TWO], sync_response_order=SyncResponseOrder.FIRST ), sc_info=None, - final_status=URLStatus.NOT_FOUND + final_url_status=URLStatus.NOT_FOUND ), TestURLSetupEntry( # An SC-only pending URL, should be unchanged. 
@@ -99,9 +54,9 @@ class TestURLSetupEntry(BaseModel): description='Pre-existing URL 3 Description', record_type=RecordType.FIELD_CONTACTS, url_status=URLStatus.PENDING, - agency_ids=[AgencyAssigned.ONE, AgencyAssigned.THREE] + agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] ), - final_status=URLStatus.PENDING + final_url_status=URLStatus.PENDING ), TestURLSetupEntry( # A DS-only rejected URL @@ -113,11 +68,11 @@ class TestURLSetupEntry(BaseModel): url_status=DataSourcesURLStatus.OK, approval_status=ApprovalStatus.REJECTED, record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[AgencyAssigned.ONE], + agencies_assigned=[AgencyAssigned.ONE], sync_response_order=SyncResponseOrder.FIRST ), sc_info=None, - final_status=URLStatus.NOT_RELEVANT + final_url_status=URLStatus.NOT_RELEVANT ), TestURLSetupEntry( # A pre-existing URL in the second response @@ -128,26 +83,19 @@ class TestURLSetupEntry(BaseModel): description='New URL 5 Description', url_status=DataSourcesURLStatus.OK, approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[AgencyAssigned.ONE], + record_type=RecordType.INCARCERATION_RECORDS, + agencies_assigned=[AgencyAssigned.ONE], sync_response_order=SyncResponseOrder.SECOND ), sc_info=TestSCURLSetupEntry( name='Pre-existing URL 5 Name', description='Pre-existing URL 5 Description', - record_type=RecordType.ACCIDENT_REPORTS, + record_type=None, url_status=URLStatus.PENDING, - agency_ids=[] + agencies_assigned=[] ), - final_status=URLStatus.VALIDATED + final_url_status=URLStatus.VALIDATED ) ] -class TestURLPostSetupRecord(BaseModel): - """Stores a setup entry along with relevant database-generated ids""" - url_id: int - sc_setup_entry: TestSCURLSetupEntry | None - ds_setup_entry: TestDSURLSetupEntry | None - sc_agency_ids: list[int] | None - ds_agency_ids: list[int] | None \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py new file mode 100644 index 00000000..fd1e1da2 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py @@ -0,0 +1,16 @@ +from enum import Enum + + +class SyncResponseOrder(Enum): + """Represents which sync response the entry is in.""" + FIRST = 1 + SECOND = 2 + # No entries should be in 3 + THIRD = 3 + + +class AgencyAssigned(Enum): + """Represents which of several pre-created agencies the entry is assigned to.""" + ONE = 1 + TWO = 2 + THREE = 3 diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py deleted file mode 100644 index f16bdfa7..00000000 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/info.py +++ /dev/null @@ -1,28 +0,0 @@ -from pydantic import BaseModel - -from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo - - -class TestDataSourcesSyncSetupInfo(BaseModel): - - class Config: - allow_arbitrary_types = True - - operator: SyncDataSourcesTaskOperator - db_client: AsyncDatabaseClient - preexisting_urls: list[URL] - preexisting_urls_ids: list[int] - first_call_response: DataSourcesSyncResponseInfo - second_call_response: DataSourcesSyncResponseInfo - 
third_call_response: DataSourcesSyncResponseInfo - - @property - def data_sources_sync_response(self) -> list[DataSourcesSyncResponseInfo]: - return [ - self.first_call_response, - self.second_call_response, - self.third_call_response - ] \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py new file mode 100644 index 00000000..f7fd5765 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py @@ -0,0 +1,31 @@ +from sqlalchemy import select + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.agency.sqlalchemy import Agency +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned + + +class AgencyAssignmentManager: + + def __init__(self, adb_client: AsyncDatabaseClient): + self.adb_client = adb_client + self._dict: dict[AgencyAssigned, int] = {} + + async def setup(self): + agencies = [] + for ag_enum in AgencyAssigned: + agency = Agency( + agency_id=ag_enum.value, + name=f"Test Agency {ag_enum.name}", + state="test_state", + county="test_county", + locality="test_locality" + ) + agencies.append(agency) + await self.adb_client.add_all(agencies) + agency_ids = await self.adb_client.scalars(select(Agency.agency_id)) + for ag_enum, agency_id in zip(AgencyAssigned, agency_ids): + self._dict[ag_enum] = agency_id + + async def get(self, ag_enum: AgencyAssigned) -> int: + return self._dict[ag_enum] diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py new file mode 100644 index 00000000..0720edfa --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py @@ -0,0 +1,96 @@ +from collections import defaultdict + +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo, DataSourcesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.agency import AgencyAssignmentManager +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.queries.check import \ + CheckURLQueryBuilder +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.url import URLSetupFunctor +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.core import TestURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord + + +class DataSourcesSyncTestSetupManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient, + entries: list[TestURLSetupEntry], + ): + self.adb_client = adb_client + self.entries = entries + self.agency_assignment_manager = AgencyAssignmentManager(self.adb_client) + + self.url_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} + self.ds_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} + + self.response_dict: dict[ + 
SyncResponseOrder, list[DataSourcesSyncResponseInnerInfo] + ] = defaultdict(list) + + async def setup(self): + await self.setup_agencies() + await self.setup_entries() + + async def setup_entries(self): + for entry in self.entries: + await self.setup_entry(entry) + + async def setup_entry( + self, + entry: TestURLSetupEntry + ) -> None: + """ + Modifies: + self.url_id_to_setup_record + self.ds_id_to_setup_record + self.response_dict + """ + functor = URLSetupFunctor( + entry=entry, + agency_assignment_manager=self.agency_assignment_manager, + adb_client=self.adb_client + ) + result = await functor() + response_info = result.ds_response_info + if response_info is not None: + self.response_dict[entry.ds_info.sync_response_order].append(response_info) + if result.url_id is not None: + self.url_id_to_setup_record[result.url_id] = result + if result.data_sources_id is not None: + self.ds_id_to_setup_record[result.data_sources_id] = result + + async def setup_agencies(self): + await self.agency_assignment_manager.setup() + + async def get_data_sources_sync_responses( + self, + orders: list[SyncResponseOrder] + ) -> list[DataSourcesSyncResponseInfo]: + results = [] + for order in orders: + results.append( + DataSourcesSyncResponseInfo( + data_sources=self.response_dict[order] + ) + ) + return results + + async def check_via_url(self, url_id: int): + builder = CheckURLQueryBuilder( + record=self.url_id_to_setup_record[url_id] + ) + await self.adb_client.run_query_builder(builder) + + async def check_via_data_source(self, data_source_id: int): + builder = CheckURLQueryBuilder( + record=self.ds_id_to_setup_record[data_source_id] + ) + await self.adb_client.run_query_builder(builder) + + async def check_results(self): + for url_id in self.url_id_to_setup_record.keys(): + await self.check_via_url(url_id) + for data_source_id in self.ds_id_to_setup_record.keys(): + await self.check_via_data_source(data_source_id) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py new file mode 100644 index 00000000..80d5ee42 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py @@ -0,0 +1,50 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord + + +class CheckURLQueryBuilder(QueryBuilderBase): + + def __init__(self, record: TestURLPostSetupRecord): + super().__init__() + self.record = record + + async def run(self, session: AsyncSession) -> None: + """Check if url and associated properties match record. 
+ Raises: + AssertionError: if url and associated properties do not match record + """ + query = ( + select(URL) + .options( + selectinload(URL.data_source), + selectinload(URL.confirmed_agencies), + ) + .join(URLDataSource, URL.id == URLDataSource.data_source_id) + .outerjoin(LinkURLAgency, URL.id == LinkURLAgency.url_id) + .join(Agency, LinkURLAgency.agency_id == Agency.agency_id) + ) + if self.record.url_id is not None: + query = query.where(URL.id == self.record.url_id) + if self.record.data_sources_id is not None: + query = query.where(URLDataSource.id == self.record.data_sources_id) + + raw_result = await session.execute(query) + result = raw_result.scalars().one_or_none() + assert result is not None + await self.check_results(result) + + async def check_results(self, url: URL): + assert url.record_type == self.record.final_record_type + assert url.description == self.record.final_description + assert url.name == self.record.final_name + agencies = [agency.agency_id for agency in url.confirmed_agencies] + assert set(agencies) == set(self.record.final_agency_ids) + assert url.outcome == self.record.final_url_status diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py new file mode 100644 index 00000000..92f52850 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py @@ -0,0 +1,95 @@ +from pendulum import today + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.agency import AgencyAssignmentManager +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.core import TestURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import \ + TestDSURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import \ + TestSCURLSetupEntry + + +class URLSetupFunctor: + + def __init__( + self, + entry: TestURLSetupEntry, + agency_assignment_manager: AgencyAssignmentManager, + adb_client: AsyncDatabaseClient + ): + self.adb_client = adb_client + self.agency_assignment_manager = agency_assignment_manager + self.prime_entry = entry + self.sc_agency_ids = None + self.ds_agency_ids = None + self.sc_url_id = None + self.ds_response_info = None + + async def __call__(self) -> TestURLPostSetupRecord: + await self.setup_entry() + return TestURLPostSetupRecord( + url_id=self.sc_url_id, + sc_setup_entry=self.prime_entry.sc_info, + ds_setup_entry=self.prime_entry.ds_info, + sc_agency_ids=self.sc_agency_ids, + ds_agency_ids=self.ds_agency_ids, + ds_response_info=self.ds_response_info, + final_url_status=self.prime_entry.final_url_status, + ) + + async def setup_entry(self): + if self.prime_entry.sc_info is not None: + self.sc_url_id = await self.setup_sc_entry(self.prime_entry.sc_info) + if self.prime_entry.ds_info is not None: + self.ds_response_info = await 
self.setup_ds_entry(self.prime_entry.ds_info) + + async def get_agency_ids(self, ags: list[AgencyAssigned]): + results = [] + for ag in ags: + results.append(await self.agency_assignment_manager.get(ag)) + return results + + async def setup_sc_entry( + self, + entry: TestSCURLSetupEntry + ) -> int: + """Set up source collector entry and return url id.""" + self.sc_agency_ids = await self.get_agency_ids(self.prime_entry.sc_info.agencies_assigned) + url = URL( + url=self.prime_entry.url, + name=entry.name, + description=entry.description, + collector_metadata={}, + outcome=entry.url_status.value, + record_type=entry.record_type.value if entry.record_type is not None else None, + ) + url_id = await self.adb_client.add(url, return_id=True) + links = [] + for ag_id in self.sc_agency_ids: + link = LinkURLAgency(url_id=url_id, agency_id=ag_id) + links.append(link) + await self.adb_client.add_all(links) + return url_id + + async def setup_ds_entry( + self, + ds_entry: TestDSURLSetupEntry + ) -> DataSourcesSyncResponseInnerInfo: + """Set up data source entry and return response info.""" + self.ds_agency_ids = await self.get_agency_ids(self.prime_entry.ds_info.agencies_assigned) + return DataSourcesSyncResponseInnerInfo( + id=ds_entry.id, + url=self.prime_entry.url, + name=ds_entry.name, + description=ds_entry.description, + url_status=ds_entry.url_status, + approval_status=ds_entry.approval_status, + record_type=ds_entry.record_type, + updated_at=today(), + agency_ids=self.ds_agency_ids + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/core.py new file mode 100644 index 00000000..54360b35 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/core.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import \ + TestSCURLSetupEntry + + +class TestURLSetupEntry(BaseModel): + url: str + ds_info: TestDSURLSetupEntry | None # Represents URL previously existing in DS DB + sc_info: TestSCURLSetupEntry | None # Represents URL previously existing in SC DB + + final_url_status: URLStatus diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py new file mode 100644 index 00000000..cadcfb4a --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus +from 
tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned, SyncResponseOrder + + +class TestDSURLSetupEntry(BaseModel): + """Represents URL previously existing in DS DB. + + These values should overwrite any SC values + """ + id: int # ID of URL in DS App + name: str + description: str + url_status: DataSourcesURLStatus + approval_status: ApprovalStatus + record_type: RecordType + agencies_assigned: list[AgencyAssigned] + sync_response_order: SyncResponseOrder diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/post.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/post.py new file mode 100644 index 00000000..b16233da --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/post.py @@ -0,0 +1,50 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import \ + TestDSURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import \ + TestSCURLSetupEntry + + +class TestURLPostSetupRecord(BaseModel): + """Stores a setup entry along with relevant database-generated ids""" + url_id: int | None + sc_setup_entry: TestSCURLSetupEntry | None + ds_setup_entry: TestDSURLSetupEntry | None + sc_agency_ids: list[int] | None + ds_agency_ids: list[int] | None + ds_response_info: DataSourcesSyncResponseInnerInfo | None + final_url_status: URLStatus + + @property + def data_sources_id(self) -> int | None: + if self.ds_setup_entry is None: + return None + return self.ds_setup_entry.id + + @property + def final_record_type(self) -> RecordType: + if self.ds_setup_entry is not None: + return self.ds_setup_entry.record_type + return self.sc_setup_entry.record_type + + @property + def final_name(self) -> str: + if self.ds_setup_entry is not None: + return self.ds_setup_entry.name + return self.sc_setup_entry.name + + @property + def final_description(self) -> str: + if self.ds_setup_entry is not None: + return self.ds_setup_entry.description + return self.sc_setup_entry.description + + @property + def final_agency_ids(self) -> list[int] | None: + if self.ds_setup_entry is not None: + return self.ds_agency_ids + return self.sc_agency_ids \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/source_collector.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/source_collector.py new file mode 100644 index 00000000..83092f7e --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/source_collector.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned + + +class TestSCURLSetupEntry(BaseModel): + """Represents URL previously existing in SC DB. 
+ + These values should be overridden by any DS values + """ + name: str + description: str + record_type: RecordType | None + url_status: URLStatus + agencies_assigned: list[AgencyAssigned] diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py index 59594923..b0f98c3f 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py @@ -3,24 +3,34 @@ import pytest from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters -from src.db.models.instantiations.url.core.sqlalchemy import URL -from tests.automated.integration.tasks.scheduled.sync.agency.helpers import check_sync_concluded +from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator +from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources -from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.info import TestDataSourcesSyncSetupInfo +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import ENTRIES +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.core import \ + DataSourcesSyncTestSetupManager from tests.helpers.asserts import assert_task_run_success @pytest.mark.asyncio async def test_data_sources_sync_happy_path( - setup: TestDataSourcesSyncSetupInfo + test_operator: SyncDataSourcesTaskOperator ): - operator = setup.operator - adb_client = operator.adb_client + adb_client = test_operator.adb_client - with patch_sync_data_sources([setup.first_call_response, setup.second_call_response, setup.third_call_response]): - run_info = await operator.run_task(1) + manager = DataSourcesSyncTestSetupManager( + adb_client=adb_client, + entries=ENTRIES + ) + await manager.setup() + + with patch_sync_data_sources( + await manager.get_data_sources_sync_responses([order for order in SyncResponseOrder]) + ): + run_info = await test_operator.run_task(1) assert_task_run_success(run_info) - mock_func: MagicMock = operator.pdap_client.sync_data_sources + mock_func: MagicMock = test_operator.pdap_client.sync_data_sources mock_func.assert_has_calls( [ @@ -46,10 +56,9 @@ async def test_data_sources_sync_happy_path( ) await check_sync_concluded(adb_client, check_updated_at=False) - # Check six URLs in database - urls: list[URL] = await adb_client.get_all(URL) - assert len(urls) == 6 + # TODO: Fill in additional components + + # Check results according to expectations. 
+ await manager.check_results() + - checker = URLChecker() - for url in urls: - checker.check_url(url) diff --git a/tests/automated/integration/tasks/url/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/auto_relevant/test_task.py index 6458c8a9..886cec09 100644 --- a/tests/automated/integration/tasks/url/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/auto_relevant/test_task.py @@ -1,5 +1,8 @@ +from collections import Counter + import pytest +from src.collectors.enums import URLStatus from src.db.enums import TaskType from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo @@ -28,8 +31,9 @@ async def test_url_auto_relevant_task(db_data_creator): # Get URLs, confirm one is marked as error urls: list[URL] = await adb_client.get_all(URL) assert len(urls) == 3 - statuses = [url.outcome for url in urls] - assert sorted(statuses) == sorted(["pending", "pending", "error"]) + counter = Counter([url.outcome for url in urls]) + assert counter[URLStatus.ERROR] == 1 + assert counter[URLStatus.PENDING] == 2 # Confirm two annotations were created suggestions: list[AutoRelevantSuggestion] = await adb_client.get_all(AutoRelevantSuggestion) diff --git a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py index 1ded4ba5..816724b8 100644 --- a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py +++ b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py @@ -68,7 +68,7 @@ async def test_url_duplicate_task( assert duplicate_url.url_id in url_ids for url in urls: if url.id == duplicate_url.url_id: - assert url.outcome == URLStatus.DUPLICATE.value + assert url.outcome == URLStatus.DUPLICATE checked_for_duplicates: list[URLCheckedForDuplicate] = await adb_client.get_all(URLCheckedForDuplicate) assert len(checked_for_duplicates) == 2 diff --git a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py index 3b3dd163..cfa2be99 100644 --- a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py @@ -139,9 +139,9 @@ async def test_submit_approved_url_task( url_3 = urls[2] # Check URLs have been marked as 'submitted' - assert url_1.outcome == URLStatus.SUBMITTED.value - assert url_2.outcome == URLStatus.SUBMITTED.value - assert url_3.outcome == URLStatus.ERROR.value + assert url_1.outcome == URLStatus.SUBMITTED + assert url_2.outcome == URLStatus.SUBMITTED + assert url_3.outcome == URLStatus.ERROR # Get URL Data Source Links url_data_sources = await db_data_creator.adb_client.get_all(URLDataSource) diff --git a/tests/automated/integration/tasks/url/test_url_404_probe.py b/tests/automated/integration/tasks/url/test_url_404_probe.py index 2cc8294f..8966e416 100644 --- a/tests/automated/integration/tasks/url/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/test_url_404_probe.py @@ -126,10 +126,10 @@ def find_url(url_id: int) -> URL: return url raise Exception(f"URL with id {url_id} not found") - assert find_url(url_id_success).outcome == URLStatus.PENDING.value - assert find_url(url_id_404).outcome == URLStatus.NOT_FOUND.value - assert find_url(url_id_error).outcome == URLStatus.PENDING.value - assert find_url(url_id_initial_error).outcome == 
URLStatus.ERROR.value + assert find_url(url_id_success).outcome == URLStatus.PENDING + assert find_url(url_id_404).outcome == URLStatus.NOT_FOUND + assert find_url(url_id_error).outcome == URLStatus.PENDING + assert find_url(url_id_initial_error).outcome == URLStatus.ERROR # Check that meets_task_prerequisites now returns False meets_prereqs = await operator.meets_task_prerequisites() From 949b7edbacdc86f923574b403606f15ee2f8eea5 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 24 Jul 2025 09:00:41 -0400 Subject: [PATCH 4/6] Add session helper and continue work on query builder --- .../queries/get_annotation_batch_info.py | 2 +- .../get_next_url_for_user_annotation.py | 2 +- .../agency/get/queries/next_for_annotation.py | 4 +- src/api/endpoints/annotate/all/get/query.py | 2 +- src/api/endpoints/batch/duplicates/query.py | 2 +- src/api/endpoints/batch/urls/query.py | 2 +- src/api/endpoints/collector/manual/query.py | 2 +- .../metrics/batches/aggregated/query.py | 2 +- .../metrics/batches/breakdown/query.py | 2 +- src/api/endpoints/review/approve/query.py | 2 +- src/api/endpoints/review/next/query.py | 4 +- .../data_sources/queries/upsert_/__init__.py | 0 .../queries/{upsert.py => upsert_/core.py} | 65 ++++++++++- .../queries/upsert_/url_agency_link.py | 9 ++ ...pending_urls_without_agency_suggestions.py | 2 +- src/db/client/async_.py | 97 ++++------------ src/db/client/sync.py | 2 +- src/db/dto_converter.py | 2 +- .../link/{link_batch_urls.py => batch_url.py} | 0 .../link/{link_task_url.py => task_url.py} | 0 .../link/url_agency/__init__.py | 0 .../link/url_agency/pydantic.py | 6 + .../url_agency/sqlalchemy.py} | 0 src/db/queries/base/builder.py | 9 +- .../url_counts/builder.py | 2 +- src/db/session_helper.py | 107 ++++++++++++++++++ src/db/statement_composer.py | 6 +- .../test_approve_and_get_next_source.py | 2 +- .../integration/api/test_manual_batch.py | 2 +- .../db/client/approve_url/test_basic.py | 2 +- .../tasks/scheduled/sync/agency/helpers.py | 2 +- .../scheduled/sync/agency/test_happy_path.py | 1 + .../sync/data_sources/existence_checker.py | 2 +- .../setup/manager/queries/check.py | 2 +- .../sync/data_sources/setup/manager/url.py | 2 +- 35 files changed, 237 insertions(+), 111 deletions(-) create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert_/__init__.py rename src/core/tasks/scheduled/sync/data_sources/queries/{upsert.py => upsert_/core.py} (52%) create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py rename src/db/models/instantiations/link/{link_batch_urls.py => batch_url.py} (100%) rename src/db/models/instantiations/link/{link_task_url.py => task_url.py} (100%) create mode 100644 src/db/models/instantiations/link/url_agency/__init__.py create mode 100644 src/db/models/instantiations/link/url_agency/pydantic.py rename src/db/models/instantiations/{confirmed_url_agency.py => link/url_agency/sqlalchemy.py} (100%) create mode 100644 src/db/session_helper.py diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 1bab0fdf..31b858c5 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -5,7 +5,7 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from 
src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index 8cadb337..50b77d0a 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,7 +5,7 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index d1c96769..1d1a1499 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -9,8 +9,8 @@ from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index 7ce8a94f..2db7191a 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -10,7 +10,7 @@ from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index 389cfa8a..1f958a62 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -5,7 +5,7 @@ from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate -from src.db.models.instantiations.link.link_batch_urls import 
LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 40aa5935..c7b4d2ee 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 8008dc5b..03e2cc36 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -6,7 +6,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py index c644a742..8d5f0f56 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query.py @@ -7,7 +7,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 36914e29..ad15c398 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -7,7 +7,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/review/approve/query.py b/src/api/endpoints/review/approve/query.py index 14d465bf..ea18dfb0 100644 --- a/src/api/endpoints/review/approve/query.py +++ b/src/api/endpoints/review/approve/query.py @@ -10,7 +10,7 @@ from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.instantiations.agency.sqlalchemy import 
Agency -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 2971dc16..1e8c4445 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -13,8 +13,8 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py similarity index 52% rename from src/core/tasks/scheduled/sync/data_sources/queries/upsert.py rename to src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py index 164f5633..c70bcbec 100644 --- a/src/core/tasks/scheduled/sync/data_sources/queries/upsert.py +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py @@ -1,15 +1,74 @@ +from typing import final + +from sqlalchemy.ext.asyncio import AsyncSession +import src.db.session_helper as sh +from typing_extensions import override + from src.collectors.enums import URLStatus from src.db.models.instantiations.url.core.pydantic.upsert import URLUpsertModel from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus -# upsert_urls_from_data_sources - +@final class UpsertURLsFromDataSourcesQueryBuilder(QueryBuilderBase): - def __init__(self): + def __init__(self, data_sources: list[DataSourcesSyncResponseInnerInfo]): super().__init__() + self.data_sources = data_sources + + @override + async def run(self, session: AsyncSession) -> None: + await self.upsert_urls(session=session) + await self.update_agency_links() + await self.update_url_data_sources() + + async def upsert_urls(self, session: AsyncSession): + results = [] + for data_source in self.data_sources: + results.append( + URLUpsertModel( + id=data_source.id, + name=data_source.name, + description=data_source.description, + outcome=_convert_to_source_collector_url_status( + ds_url_status=data_source.url_status, + ds_approval_status=data_source.approval_status + ), + record_type=data_source.record_type + ) + ) + await sh.bulk_upsert(session=session, models=results) + + async def update_agency_links(self) -> 
None: + """Overwrite existing url_agency links with new ones, if applicable.""" + for data_source in self.data_sources: + + # Get existing links + pass + # Get new links + pass + # Remove all links not in new links + pass + # Add new links + pass + + + async def update_url_data_sources(self) -> None: + # Get existing url-data sources attributes + pass + + # Get new url-data sources attributes + pass + + # Overwrite all existing url-data sources attributes that are not in new + pass + + # Add new url-data sources attributes + pass + + raise NotImplementedError + def convert_data_sources_sync_response_to_url_upsert( data_sources: list[DataSourcesSyncResponseInnerInfo] diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py new file mode 100644 index 00000000..84dda14d --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py @@ -0,0 +1,9 @@ +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyUpsertModel +from src.db.queries.base.builder import QueryBuilderBase + + +class URLAgencyLinkUpsertQueryBuilder(QueryBuilderBase): + + def __init__(self, models: list[LinkURLAgencyUpsertModel]): + super().__init__() + self.models = models \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py index 327c2a9f..0c814cb2 100644 --- a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py @@ -4,7 +4,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 7865a8e2..fe481742 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -3,8 +3,7 @@ from operator import or_ from typing import Optional, Type, Any, List, Sequence -from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, text, Row -from sqlalchemy.dialects import postgresql +from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker @@ -64,7 +63,7 @@ from src.core.tasks.scheduled.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query from src.core.tasks.scheduled.sync.data_sources.queries.update_sync_progress import \ get_update_data_sources_sync_progress_query -from src.core.tasks.scheduled.sync.data_sources.queries.upsert import convert_data_sources_sync_response_to_url_upsert +from src.core.tasks.scheduled.sync.data_sources.queries.upsert_.core import 
convert_data_sources_sync_response_to_url_upsert from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \ @@ -81,19 +80,12 @@ from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ HasPendingURsMissingMiscellaneousDataQueryBuilder from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.db import session_helper as sh from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.dto_converter import DTOConverter -from src.db.models.instantiations.batch.pydantic import BatchInfo -from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo -from src.db.models.instantiations.log.pydantic.info import LogInfo -from src.db.models.instantiations.log.pydantic.output import LogOutputInfo -from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping @@ -101,20 +93,26 @@ from src.db.enums import TaskType from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.backlog_snapshot import BacklogSnapshot +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.link.link_task_url import LinkTaskURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.task_url import LinkTaskURL +from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.instantiations.log.pydantic.output import LogOutputInfo from src.db.models.instantiations.log.sqlalchemy import Log from src.db.models.instantiations.root_url_cache import RootURL -from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import 
URL from src.db.models.instantiations.url.data_source import URLDataSource +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -123,6 +121,7 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.templates import Base @@ -186,13 +185,7 @@ async def add( model: Base, return_id: bool = False ) -> int | None: - session.add(model) - if return_id: - if not hasattr(model, "id"): - raise AttributeError("Models must have an id attribute") - await session.flush() - return model.id - return None + return await sh.add(session=session, model=model, return_id=return_id) @session_manager async def add_all( @@ -201,16 +194,7 @@ async def add_all( models: list[Base], return_ids: bool = False ) -> list[int] | None: - session.add_all(models) - if return_ids: - if not hasattr(models[0], "id"): - raise AttributeError("Models must have an id attribute") - await session.flush() - return [ - model.id # pyright: ignore [reportAttributeAccessIssue] - for model in models - ] - return None + return await sh.add_all(session=session, models=models, return_ids=return_ids) @session_manager async def bulk_update( @@ -231,45 +215,20 @@ async def bulk_upsert( session: AsyncSession, models: list[UpsertModel], ): - if len(models) == 0: - return - - first_model = models[0] - - query = pg_insert(first_model.sa_model) - - mappings = [upsert_model.model_dump() for upsert_model in models] - - set_ = {} - for k, v in mappings[0].items(): - if k == first_model.id_field: - continue - set_[k] = getattr(query.excluded, k) - - query = query.on_conflict_do_update( - index_elements=[first_model.id_field], - set_=set_ - ) - - - # Note, mapping must include primary key - await session.execute( - query, - mappings - ) + return await sh.bulk_upsert(session, models) @session_manager async def scalar(self, session: AsyncSession, statement): """Fetch the first column of the first row.""" - return (await session.execute(statement)).scalar() + return await sh.scalar(session, statement) @session_manager async def scalars(self, session: AsyncSession, statement): - return (await session.execute(statement)).scalars().all() + return await sh.scalars(session, statement) @session_manager async def mapping(self, session: AsyncSession, statement): - return (await session.execute(statement)).mappings().one() + return await sh.mapping(session, statement) @session_manager async def run_query_builder( @@ -615,15 +574,9 @@ async def get_all( model: Base, order_by_attribute: Optional[str] = None ) -> list[Base]: - """ - Get all records of a model - Used primarily in testing - """ - statement = select(model) - if order_by_attribute: - statement = statement.order_by(getattr(model, order_by_attribute)) - result = 
await session.execute(statement) - return result.scalars().all() + """Get all records of a model. Used primarily in testing.""" + return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) + @session_manager async def load_root_url_cache(self, session: AsyncSession) -> dict[str, str]: @@ -1341,10 +1294,6 @@ def case_column(status: URLStatus, label): oldest_pending_url_created_at=oldest_pending_created_at, ) - def compile(self, statement): - compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) - return compiled_sql - @session_manager async def get_urls_breakdown_pending_metrics( self, diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 558a8f18..7d435118 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -13,7 +13,7 @@ from src.db.models.instantiations.log.pydantic.info import LogInfo from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.templates import Base from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.log.sqlalchemy import Log diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index d640a851..4afa641e 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -9,7 +9,7 @@ from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo from src.db.dtos.url.with_html import URLWithHTML -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/src/db/models/instantiations/link/link_batch_urls.py b/src/db/models/instantiations/link/batch_url.py similarity index 100% rename from src/db/models/instantiations/link/link_batch_urls.py rename to src/db/models/instantiations/link/batch_url.py diff --git a/src/db/models/instantiations/link/link_task_url.py b/src/db/models/instantiations/link/task_url.py similarity index 100% rename from src/db/models/instantiations/link/link_task_url.py rename to src/db/models/instantiations/link/task_url.py diff --git a/src/db/models/instantiations/link/url_agency/__init__.py b/src/db/models/instantiations/link/url_agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/link/url_agency/pydantic.py b/src/db/models/instantiations/link/url_agency/pydantic.py new file mode 100644 index 00000000..f76aa30a --- /dev/null +++ b/src/db/models/instantiations/link/url_agency/pydantic.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class LinkURLAgencyUpsertModel(BaseModel): + url_id: int + agency_ids: list[int] \ No newline at end of file diff --git a/src/db/models/instantiations/confirmed_url_agency.py b/src/db/models/instantiations/link/url_agency/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/confirmed_url_agency.py rename to 
src/db/models/instantiations/link/url_agency/sqlalchemy.py diff --git a/src/db/queries/base/builder.py b/src/db/queries/base/builder.py index 5806ef47..1295fbd1 100644 --- a/src/db/queries/base/builder.py +++ b/src/db/queries/base/builder.py @@ -1,9 +1,9 @@ from typing import Any, Generic, Optional from sqlalchemy import FromClause, ColumnClause -from sqlalchemy.dialects import postgresql from sqlalchemy.ext.asyncio import AsyncSession +from src.db import session_helper as sh from src.db.types import LabelsType @@ -33,9 +33,4 @@ async def run(self, session: AsyncSession) -> Any: @staticmethod def compile(query) -> Any: - return query.compile( - dialect=postgresql.dialect(), - compile_kwargs={ - "literal_binds": True - } - ) + return sh.compile_to_sql(query) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 11a332dd..d1ab774e 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -5,7 +5,7 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/session_helper.py b/src/db/session_helper.py new file mode 100644 index 00000000..f86d968d --- /dev/null +++ b/src/db/session_helper.py @@ -0,0 +1,107 @@ +""" +session_helper (aliased as sh) contains a number of convenience +functions for working with a SQLAlchemy session +""" +from typing import Any, Optional + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.dialects.postgresql import insert as pg_insert + +from src.db.models.templates import Base +from src.db.templates.upsert import UpsertModel + + +async def scalar(session: AsyncSession, query: sa.Select) -> Any: + """Fetch the first column of the first row.""" + raw_result = await session.execute(query) + return raw_result.scalar() + +async def scalars(session: AsyncSession, query: sa.Select) -> Any: + raw_result = await session.execute(query) + return raw_result.scalars().all() + +async def mapping(session: AsyncSession, query: sa.Select) -> sa.RowMapping: + raw_result = await session.execute(query) + return raw_result.mappings().one() + + +async def bulk_upsert( + session: AsyncSession, + models: list[UpsertModel], +): + if len(models) == 0: + return + + first_model = models[0] + + query = pg_insert(first_model.sa_model) + + mappings = [upsert_model.model_dump() for upsert_model in models] + + set_ = {} + for k, v in mappings[0].items(): + if k == first_model.id_field: + continue + set_[k] = getattr(query.excluded, k) + + query = query.on_conflict_do_update( + index_elements=[first_model.id_field], + set_=set_ + ) + + # Note, mapping must include primary key + await session.execute( + query, + mappings + ) + +async def add( + session: AsyncSession, + model: Base, + return_id: bool = False +) -> int | None: + session.add(model) + if return_id: + if not hasattr(model, "id"): + raise AttributeError("Models must have an id attribute") + await
session.flush() + return model.id + return None + + +async def add_all( + session: AsyncSession, + models: list[Base], + return_ids: bool = False +) -> list[int] | None: + session.add_all(models) + if return_ids: + if not hasattr(models[0], "id"): + raise AttributeError("Models must have an id attribute") + await session.flush() + return [ + model.id # pyright: ignore [reportAttributeAccessIssue] + for model in models + ] + return None + +async def get_all( + session: AsyncSession, + model: Base, + order_by_attribute: Optional[str] = None +) -> list[Base]: + """ + Get all records of a model + Used primarily in testing + """ + statement = sa.select(model) + if order_by_attribute: + statement = statement.order_by(getattr(model, order_by_attribute)) + result = await session.execute(statement) + return result.scalars().all() + +def compile_to_sql(statement) -> str: + compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) + return compiled_sql \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 91f4926f..dfac8c9c 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -7,9 +7,9 @@ from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.link.link_task_url import LinkTaskURL +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.task_url import LinkTaskURL from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index f706a6ee..4dcb3fdc 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -6,7 +6,7 @@ from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 8f51ab9c..bdf858f7 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,7 +2,7 @@ import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from 
src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 59568266..7af3807c 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,7 +3,7 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL diff --git a/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py index 6fe988a6..a60f0586 100644 --- a/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py @@ -21,7 +21,7 @@ async def check_sync_concluded( ) ) - sync_state_results = await db_client.mapping( + sync_state_results = await db_client.scalar( select( AgenciesSyncState ) diff --git a/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py index c7d6bca7..02cefa3e 100644 --- a/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py @@ -14,6 +14,7 @@ @pytest.mark.asyncio async def test_agency_sync_happy_path( + wiped_database, setup: SyncAgenciesTaskOperator ): operator = setup diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py index 64e0f742..22d5424d 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py @@ -1,6 +1,6 @@ from collections import defaultdict -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source import URLDataSource from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py index 80d5ee42..5cd8aeb4 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import selectinload from src.db.models.instantiations.agency.sqlalchemy import Agency -from 
src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source import URLDataSource from src.db.queries.base.builder import QueryBuilderBase diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py index 92f52850..8edbbf33 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py @@ -1,7 +1,7 @@ from pendulum import today from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.confirmed_url_agency import LinkURLAgency +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned From 39581259b2771f1fe8a2c368361d6788748c66ae Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 25 Jul 2025 20:31:37 -0400 Subject: [PATCH 5/6] Finish draft --- alembic/env.py | 2 +- apply_migrations.py | 3 +- .../agency/get/queries/next_for_annotation.py | 2 +- src/api/endpoints/batch/urls/dto.py | 2 +- src/api/endpoints/batch/urls/query.py | 2 +- src/api/endpoints/review/approve/query.py | 2 +- src/api/endpoints/review/next/query.py | 2 +- src/api/endpoints/task/by_id/dto.py | 2 +- src/api/endpoints/task/by_id/query.py | 2 +- src/core/preprocessors/autogoogler.py | 2 +- src/core/preprocessors/base.py | 2 +- src/core/preprocessors/ckan.py | 2 +- src/core/preprocessors/common_crawler.py | 2 +- src/core/preprocessors/example.py | 2 +- src/core/preprocessors/muckrock.py | 2 +- .../scheduled/sync/data_sources/operator.py | 2 +- .../{dtos/parameters.py => params.py} | 0 .../data_sources/queries/get_sync_params.py | 2 +- .../{dtos => queries/upsert}/__init__.py | 0 .../{upsert_ => upsert/agency}/__init__.py | 0 .../queries/upsert/agency/convert.py | 14 ++ .../queries/upsert/agency/core.py | 13 ++ .../queries/upsert/agency/params.py | 7 + .../queries/upsert/agency/query.py | 79 +++++++ .../sync/data_sources/queries/upsert/core.py | 94 ++++++++ .../queries/upsert/helpers}/__init__.py | 0 .../queries/upsert/helpers/convert.py | 64 ++++++ .../queries/upsert/helpers/filter.py | 29 +++ .../data_sources/queries/upsert/mapper.py | 13 ++ .../queries/upsert/param_manager.py | 101 +++++++++ .../data_sources/queries/upsert/requester.py | 78 +++++++ .../queries/upsert/url/__init__.py | 0 .../queries/upsert/url/insert/__init__.py | 0 .../queries/upsert/url/insert/params.py | 16 ++ .../queries/upsert/url/lookup/__init__.py | 0 .../queries/upsert/url/lookup/format.py | 7 + .../queries/upsert/url/lookup/query.py | 62 +++++ .../queries/upsert/url/lookup/response.py | 10 + .../queries/upsert/url/update/__init__.py | 0 .../queries/upsert/url/update/params.py | 21 ++ .../sync/data_sources/queries/upsert_/core.py | 117 ---------- .../queries/upsert_/url_agency_link.py | 9 - src/core/tasks/url/operators/url_html/core.py | 2 +- .../get_pending_urls_without_html_data.py | 2 +- src/core/tasks/url/operators/url_html/tdo.py | 2 +- src/db/client/async_.py | 32 ++- src/db/client/sync.py | 4 +- 
src/db/dto_converter.py | 2 +- src/db/helpers.py | 2 - src/db/helpers/__init__.py | 0 src/db/helpers/connect.py | 5 + src/db/helpers/session/__init__.py | 0 src/db/helpers/session/parser.py | 41 ++++ src/db/helpers/session/session_helper.py | 214 ++++++++++++++++++ src/db/helpers/session/types.py | 8 + .../instantiations/agency/pydantic/upsert.py | 12 +- .../link/url_agency/pydantic.py | 15 +- .../link/url_agency/sqlalchemy.py | 2 +- .../core/{pydantic/info.py => pydantic.py} | 0 .../url/core/pydantic/upsert.py | 23 -- .../url/data_source/__init__.py | 0 .../url/data_source/pydantic.py | 11 + .../sqlalchemy.py} | 0 src/db/queries/base/builder.py | 2 +- src/db/session_helper.py | 107 --------- src/db/statement_composer.py | 2 +- src/db/templates/markers/__init__.py | 0 src/db/templates/markers/bulk/__init__.py | 0 src/db/templates/markers/bulk/delete.py | 6 + src/db/templates/markers/bulk/insert.py | 5 + src/db/templates/markers/bulk/update.py | 5 + src/db/templates/markers/bulk/upsert.py | 5 + src/db/templates/protocols/__init__.py | 0 src/db/templates/protocols/has_id.py | 6 + .../protocols/sa_correlated/__init__.py | 0 .../templates/protocols/sa_correlated/core.py | 15 ++ .../protocols/sa_correlated/with_id.py | 20 ++ src/db/templates/upsert.py | 20 -- src/db/utils/validate.py | 13 ++ src/external/pdap/client.py | 2 +- src/external/pdap/dtos/sync/data_sources.py | 2 +- .../test_approve_and_get_next_source.py | 2 +- .../db/client/approve_url/test_basic.py | 2 +- .../db/client/test_delete_url_updated_at.py | 2 +- .../integration/db/client/test_insert_urls.py | 2 +- .../integration/db/structure/test_batch.py | 2 +- .../integration/db/structure/testers/table.py | 2 +- .../sync/data_sources/existence_checker.py | 2 +- .../scheduled/sync/data_sources/setup/data.py | 7 +- .../sync/data_sources/setup/manager/core.py | 19 +- .../setup/manager/queries/check.py | 16 +- .../sync/data_sources/setup/manager/url.py | 2 +- .../setup/models/url/data_sources.py | 2 +- .../sync/data_sources/test_happy_path.py | 4 +- .../sync/data_sources/test_interruption.py | 65 ++++++ .../sync/data_sources/test_no_new_results.py | 59 +++++ .../url/test_submit_approved_url_task.py | 2 +- tests/automated/unit/db/__init__.py | 0 tests/automated/unit/db/utils/__init__.py | 0 .../unit/db/utils/validate/__init__.py | 0 .../unit/db/utils/validate/mock/__init__.py | 0 .../unit/db/utils/validate/mock/class_.py | 10 + .../unit/db/utils/validate/mock/protocol.py | 7 + .../validate/test_all_models_of_same_type.py | 17 ++ .../db/utils/validate/test_has_protocol.py | 17 ++ .../test_autogoogler_collector.py | 2 +- .../test_common_crawl_collector.py | 2 +- .../test_muckrock_collectors.py | 2 +- tests/conftest.py | 7 +- tests/helpers/db_data_creator.py | 2 +- .../test_html_tag_collector_integration.py | 2 +- 111 files changed, 1251 insertions(+), 363 deletions(-) rename src/core/tasks/scheduled/sync/data_sources/{dtos/parameters.py => params.py} (100%) rename src/core/tasks/scheduled/sync/data_sources/{dtos => queries/upsert}/__init__.py (100%) rename src/core/tasks/scheduled/sync/data_sources/queries/{upsert_ => upsert/agency}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py create mode 100644 
src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py rename src/{db/models/instantiations/url/core/pydantic => core/tasks/scheduled/sync/data_sources/queries/upsert/helpers}/__init__.py (100%) create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/__init__.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/__init__.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/__init__.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/__init__.py create mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py delete mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py delete mode 100644 src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py create mode 100644 src/db/helpers/__init__.py create mode 100644 src/db/helpers/connect.py create mode 100644 src/db/helpers/session/__init__.py create mode 100644 src/db/helpers/session/parser.py create mode 100644 src/db/helpers/session/session_helper.py create mode 100644 src/db/helpers/session/types.py rename src/db/models/instantiations/url/core/{pydantic/info.py => pydantic.py} (100%) delete mode 100644 src/db/models/instantiations/url/core/pydantic/upsert.py create mode 100644 src/db/models/instantiations/url/data_source/__init__.py create mode 100644 src/db/models/instantiations/url/data_source/pydantic.py rename src/db/models/instantiations/url/{data_source.py => data_source/sqlalchemy.py} (100%) delete mode 100644 src/db/session_helper.py create mode 100644 src/db/templates/markers/__init__.py create mode 100644 src/db/templates/markers/bulk/__init__.py create mode 100644 src/db/templates/markers/bulk/delete.py create mode 100644 src/db/templates/markers/bulk/insert.py create mode 100644 src/db/templates/markers/bulk/update.py create mode 100644 src/db/templates/markers/bulk/upsert.py create mode 100644 src/db/templates/protocols/__init__.py create mode 100644 src/db/templates/protocols/has_id.py create mode 100644 src/db/templates/protocols/sa_correlated/__init__.py create mode 100644 src/db/templates/protocols/sa_correlated/core.py create mode 100644 src/db/templates/protocols/sa_correlated/with_id.py delete mode 100644 src/db/templates/upsert.py create mode 100644 src/db/utils/validate.py create mode 100644 tests/automated/unit/db/__init__.py create mode 100644 tests/automated/unit/db/utils/__init__.py create mode 100644 tests/automated/unit/db/utils/validate/__init__.py create mode 100644 tests/automated/unit/db/utils/validate/mock/__init__.py create mode 100644 
tests/automated/unit/db/utils/validate/mock/class_.py create mode 100644 tests/automated/unit/db/utils/validate/mock/protocol.py create mode 100644 tests/automated/unit/db/utils/validate/test_all_models_of_same_type.py create mode 100644 tests/automated/unit/db/utils/validate/test_has_protocol.py diff --git a/alembic/env.py b/alembic/env.py index 3d305e32..2cf7e6c8 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -6,7 +6,7 @@ from sqlalchemy import engine_from_config from sqlalchemy import pool -from src.db.helpers import get_postgres_connection_string +from src.db.helpers.connect import get_postgres_connection_string from src.db.models.templates import Base # this is the Alembic Config object, which provides diff --git a/apply_migrations.py b/apply_migrations.py index 6b3188f3..2b217c8b 100644 --- a/apply_migrations.py +++ b/apply_migrations.py @@ -1,7 +1,8 @@ from alembic import command from alembic.config import Config -from src.db.helpers import get_postgres_connection_string +from src.db.helpers.connect import get_postgres_connection_string + def apply_migrations(): print("Applying migrations...") diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 1d1a1499..27f7a382 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -9,8 +9,8 @@ from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/src/api/endpoints/batch/urls/dto.py b/src/api/endpoints/batch/urls/dto.py index 90f9b209..13e8659c 100644 --- a/src/api/endpoints/batch/urls/dto.py +++ b/src/api/endpoints/batch/urls/dto.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo class GetURLsByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index c7b4d2ee..49b95e13 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,7 +1,7 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/review/approve/query.py b/src/api/endpoints/review/approve/query.py index ea18dfb0..c2eb8cbf 100644 --- a/src/api/endpoints/review/approve/query.py +++ b/src/api/endpoints/review/approve/query.py @@ -10,7 +10,7 @@ from src.collectors.enums import URLStatus from src.db.constants import 
PLACEHOLDER_AGENCY_NAME from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 1e8c4445..0ec83dc1 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -13,8 +13,8 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/src/api/endpoints/task/by_id/dto.py b/src/api/endpoints/task/by_id/dto.py index 65fa74c5..9213aa90 100644 --- a/src/api/endpoints/task/by_id/dto.py +++ b/src/api/endpoints/task/by_id/dto.py @@ -4,7 +4,7 @@ from pydantic import BaseModel from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.enums import TaskType from src.core.enums import BatchStatus diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index c2b32234..8133085f 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.task.by_id.dto import TaskInfo from src.collectors.enums import URLStatus from src.core.enums import BatchStatus -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.db.models.instantiations.task.core import Task diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index 8163115c..460cf0e0 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ -1,6 +1,6 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/base.py b/src/core/preprocessors/base.py index 2f777d5f..beb31cb7 100644 --- a/src/core/preprocessors/base.py +++ b/src/core/preprocessors/base.py @@ -2,7 +2,7 @@ from abc import ABC from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo class PreprocessorBase(ABC): diff --git a/src/core/preprocessors/ckan.py 
b/src/core/preprocessors/ckan.py index 0b1cef2e..b72ee3c9 100644 --- a/src/core/preprocessors/ckan.py +++ b/src/core/preprocessors/ckan.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo class CKANPreprocessor: diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index 57457ed4..16f5d730 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,6 +1,6 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index e357d2a2..691d23c6 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -1,6 +1,6 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index 7952ee56..b42a198f 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,6 +1,6 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/tasks/scheduled/sync/data_sources/operator.py b/src/core/tasks/scheduled/sync/data_sources/operator.py index 57b12663..a88fc34a 100644 --- a/src/core/tasks/scheduled/sync/data_sources/operator.py +++ b/src/core/tasks/scheduled/sync/data_sources/operator.py @@ -1,6 +1,6 @@ from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.core.tasks.scheduled.sync.check import check_max_sync_requests_not_exceeded -from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/scheduled/sync/data_sources/dtos/parameters.py b/src/core/tasks/scheduled/sync/data_sources/params.py similarity index 100% rename from src/core/tasks/scheduled/sync/data_sources/dtos/parameters.py rename to src/core/tasks/scheduled/sync/data_sources/params.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py b/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py index 4f2efe06..695813c6 100644 --- a/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py +++ b/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py @@ -2,7 +2,7 @@ from sqlalchemy.exc import NoResultFound from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters from src.db.models.instantiations.sync_state.data_sources 
import DataSourcesSyncState from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/sync/data_sources/dtos/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/__init__.py similarity index 100% rename from src/core/tasks/scheduled/sync/data_sources/dtos/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/upsert/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/__init__.py similarity index 100% rename from src/core/tasks/scheduled/sync/data_sources/queries/upsert_/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py new file mode 100644 index 00000000..05b6ec75 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py @@ -0,0 +1,14 @@ +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic + + +def convert_to_link_url_agency_models( + url_id: int, + agency_ids: list[int] +) -> list[LinkURLAgencyPydantic]: + return [ + LinkURLAgencyPydantic( + url_id=url_id, + agency_id=agency_id + ) + for agency_id in agency_ids + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py new file mode 100644 index 00000000..e1820898 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py @@ -0,0 +1,13 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.query import URLAgencyLinkUpdateQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams + + +async def update_agency_links( + session: AsyncSession, + params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] +) -> None: + """Overwrite existing url_agency links with new ones, if applicable.""" + query = URLAgencyLinkUpdateQueryBuilder(params) + await query.run(session) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py new file mode 100644 index 00000000..d43bbbd8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class UpdateLinkURLAgencyForDataSourcesSyncParams(BaseModel): + url_id: int + new_agency_ids: list[int] + old_agency_ids: list[int] diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py new file mode 100644 index 00000000..4850be39 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py @@ -0,0 +1,79 @@ +from collections import defaultdict + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.link.url_agency.pydantic import 
LinkURLAgencyPydantic +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase): + """Given a set of URL-Agency links, remove all non-matching links and add new ones.""" + + + def __init__(self, models: list[UpdateLinkURLAgencyForDataSourcesSyncParams]): + super().__init__() + self.models = models + self._new_links: dict[int, list[int]] = { + model.url_id: model.new_agency_ids + for model in self.models + } + self._existing_links: dict[int, list[int]] = defaultdict(list) + self.existing_url_ids = {model.url_id for model in self.models} + + async def _get_existing_links(self, session: AsyncSession): + """Get existing agency links for provided URLs. + + Modifies: + self._existing_links + """ + query = ( + select(LinkURLAgency) + .where( + LinkURLAgency.url_id.in_( + self.existing_url_ids + ) + ) + ) + links = await session.scalars(query) + for link in links: + self._existing_links[link.url_id].append(link.agency_id) + + async def _update_links(self, session: AsyncSession): + # Remove all existing links not in new links + links_to_delete: list[LinkURLAgencyPydantic] = [] + links_to_insert: list[LinkURLAgencyPydantic] = [] + + for url_id in self.existing_url_ids: + new_agency_ids = self._new_links.get(url_id, []) + existing_agency_ids = self._existing_links.get(url_id, []) + # IDs to delete are existing agency ids that are not new agency ids + ids_to_delete = set(existing_agency_ids) - set(new_agency_ids) + # IDs to insert are new agency ids that are not existing agency ids + ids_to_insert = set(new_agency_ids) - set(existing_agency_ids) + + links_to_delete.extend( + convert_to_link_url_agency_models( + url_id=url_id, + agency_ids=list(ids_to_delete) + ) + ) + links_to_insert.extend( + convert_to_link_url_agency_models( + url_id=url_id, + agency_ids=list(ids_to_insert) + ) + ) + + await sh.bulk_delete(session=session, models=links_to_delete) + await sh.bulk_insert(session=session, models=links_to_insert) + + async def run(self, session: AsyncSession): + await self._get_existing_links(session=session) + await self._update_links(session=session) + + diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py new file mode 100644 index 00000000..a0517b45 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py @@ -0,0 +1,94 @@ +from typing import final + +from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import override + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.helpers.filter import filter_for_urls_with_ids, \ + get_mappings_for_urls_without_data_sources +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.param_manager import \ + UpsertURLsFromDataSourcesParamManager +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.requester import UpsertURLsFromDataSourcesDBRequester +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.db.dtos.url.mapping import URLMapping +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.data_sources 
import DataSourcesSyncResponseInnerInfo + + +@final +class UpsertURLsFromDataSourcesQueryBuilder(QueryBuilderBase): + + def __init__(self, sync_infos: list[DataSourcesSyncResponseInnerInfo]): + super().__init__() + self.sync_infos = sync_infos + self.urls = {sync_info.url for sync_info in self.sync_infos} + self.param_manager = UpsertURLsFromDataSourcesParamManager( + mapper=URLSyncInfoMapper(self.sync_infos) + ) + self._session: AsyncSession | None = None + self._requester: UpsertURLsFromDataSourcesDBRequester | None = None + # Need to be able to add URL ids first before adding links or other attributes + + @property + def requester(self) -> UpsertURLsFromDataSourcesDBRequester: + """ + Modifies: + self._requester + """ + if self._requester is None: + self._requester = UpsertURLsFromDataSourcesDBRequester(self._session) + return self._requester + + @override + async def run(self, session: AsyncSession) -> None: + """ + Modifies: + self._session + """ + self._session = session + + lookup_results = await self._lookup_urls() + lookups_existing_urls = filter_for_urls_with_ids(lookup_results) + await self._update_existing_urls(lookups_existing_urls) + await self._update_agency_link(lookups_existing_urls) + mappings_without_data_sources = get_mappings_for_urls_without_data_sources(lookup_results) + await self._add_new_data_sources(mappings_without_data_sources) + + extant_urls = {lookup.url_info.url for lookup in lookups_existing_urls} + urls_to_add = list(self.urls - extant_urls) + if len(urls_to_add) == 0: + return + url_mappings = await self._add_new_urls(urls_to_add) + await self._add_new_data_sources(url_mappings) + await self._insert_agency_link(url_mappings) + + async def _lookup_urls(self): + lookup_results = await self.requester.lookup_urls(list(self.urls)) + return lookup_results + + async def _insert_agency_link(self, url_mappings: list[URLMapping]): + link_url_agency_insert_params = self.param_manager.insert_agency_link( + url_mappings + ) + await self.requester.add_new_agency_links(link_url_agency_insert_params) + + async def _update_agency_link(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + link_url_agency_update_params = self.param_manager.update_agency_link( + lookups_existing_urls + ) + await self.requester.update_agency_links(link_url_agency_update_params) + + async def _add_new_data_sources(self, url_mappings: list[URLMapping]): + url_ds_insert_params = self.param_manager.add_new_data_sources(url_mappings) + await self.requester.add_new_data_sources(url_ds_insert_params) + + async def _add_new_urls(self, urls: list[str]): + url_insert_params = self.param_manager.add_new_urls(urls) + url_mappings = await self.requester.add_new_urls(url_insert_params) + return url_mappings + + async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + update_params = self.param_manager.update_existing_urls(lookups_existing_urls) + await self.requester.update_existing_urls(update_params) + diff --git a/src/db/models/instantiations/url/core/pydantic/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/__init__.py similarity index 100% rename from src/db/models/instantiations/url/core/pydantic/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py new file mode 100644 index 
00000000..10a05d8e --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py @@ -0,0 +1,64 @@ +from src.collectors.enums import URLStatus +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.insert.params import \ + InsertURLForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.update.params import \ + UpdateURLForDataSourcesSyncParams +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus + + +def convert_to_source_collector_url_status( + ds_url_status: DataSourcesURLStatus, + ds_approval_status: ApprovalStatus +) -> URLStatus: + match ds_url_status: + case DataSourcesURLStatus.AVAILABLE: + raise NotImplementedError("Logic not implemented for this status.") + case DataSourcesURLStatus.NONE_FOUND: + raise NotImplementedError("Logic not implemented for this status.") + case DataSourcesURLStatus.BROKEN: + return URLStatus.NOT_FOUND + case _: + pass + + match ds_approval_status: + case ApprovalStatus.APPROVED: + return URLStatus.VALIDATED + case ApprovalStatus.REJECTED: + return URLStatus.NOT_RELEVANT + case ApprovalStatus.NEEDS_IDENTIFICATION: + return URLStatus.PENDING + case ApprovalStatus.PENDING: + return URLStatus.PENDING + case _: + raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") + +def convert_to_url_update_params( + url_id: int, + sync_info: DataSourcesSyncResponseInnerInfo +) -> UpdateURLForDataSourcesSyncParams: + return UpdateURLForDataSourcesSyncParams( + id=url_id, + name=sync_info.name, + description=sync_info.description, + outcome=convert_to_source_collector_url_status( + ds_url_status=sync_info.url_status, + ds_approval_status=sync_info.approval_status + ), + record_type=sync_info.record_type + ) + +def convert_to_url_insert_params( + url: str, + sync_info: DataSourcesSyncResponseInnerInfo +) -> InsertURLForDataSourcesSyncParams: + return InsertURLForDataSourcesSyncParams( + url=url, + name=sync_info.name, + description=sync_info.description, + outcome=convert_to_source_collector_url_status( + ds_url_status=sync_info.url_status, + ds_approval_status=sync_info.approval_status + ), + record_type=sync_info.record_type + ) diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py new file mode 100644 index 00000000..ef23fcd2 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py @@ -0,0 +1,29 @@ +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.db.dtos.url.mapping import URLMapping + + +def filter_for_urls_with_ids( + lookup_results: list[LookupURLForDataSourcesSyncResponse] +) -> list[LookupURLForDataSourcesSyncResponse]: + return [ + lookup_result + for lookup_result in lookup_results + if lookup_result.url_info.url_id is not None + ] + +def get_mappings_for_urls_without_data_sources( + lookup_results: list[LookupURLForDataSourcesSyncResponse] +) -> list[URLMapping]: + lookups_without_data_sources = [ + lookup_result + for lookup_result in lookup_results + if lookup_result.data_source_id is None + ] + return [ + URLMapping( + url_id=lookup_result.url_info.url_id, + url=lookup_result.url_info.url + ) + for lookup_result in lookups_without_data_sources + ] \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py new file mode 100644 index 00000000..a60904a0 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py @@ -0,0 +1,13 @@ +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo + + +class URLSyncInfoMapper: + + def __init__(self, sync_infos: list[DataSourcesSyncResponseInnerInfo]): + self._dict: dict[str, DataSourcesSyncResponseInnerInfo] = { + sync_info.url: sync_info + for sync_info in sync_infos + } + + def get(self, url: str) -> DataSourcesSyncResponseInnerInfo: + return self._dict[url] \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py new file mode 100644 index 00000000..19d8a0cd --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py @@ -0,0 +1,101 @@ +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import \ + UpdateLinkURLAgencyForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ + convert_to_url_insert_params +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.insert.params import \ + InsertURLForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.update.params import \ + UpdateURLForDataSourcesSyncParams +from src.db.dtos.url.mapping import URLMapping +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.instantiations.url.data_source.pydantic import URLDataSourcePydantic + + +class UpsertURLsFromDataSourcesParamManager: + def __init__( + self, + mapper: URLSyncInfoMapper + ): + self._mapper = mapper + + def update_existing_urls( + self, + lookup_results: list[LookupURLForDataSourcesSyncResponse] + ) -> list[UpdateURLForDataSourcesSyncParams]: + results = [] + for lookup_result in lookup_results: + url_info = lookup_result.url_info + sync_info = self._mapper.get(url_info.url) + update_params = convert_to_url_update_params( + url_id=url_info.url_id, + sync_info=sync_info + ) + results.append(update_params) + return results + + def add_new_urls( + self, + urls: list[str] + ) -> list[InsertURLForDataSourcesSyncParams]: + results = [] + for url in urls: + sync_info = self._mapper.get(url) + insert_params = convert_to_url_insert_params( + url=url, + sync_info=sync_info + ) + results.append(insert_params) + return results + + def update_agency_link( + self, + lookup_results: list[LookupURLForDataSourcesSyncResponse] + ) -> list[UpdateLinkURLAgencyForDataSourcesSyncParams]: + results = [] + for lookup_result in lookup_results: + url_info = lookup_result.url_info + sync_info = self._mapper.get(url_info.url) + update_params = UpdateLinkURLAgencyForDataSourcesSyncParams( + url_id=url_info.url_id, + new_agency_ids=sync_info.agency_ids, + old_agency_ids=url_info.agency_ids + ) + results.append(update_params) + return results + + def insert_agency_link( + self, + url_mappings: list[URLMapping] + ) -> list[LinkURLAgencyPydantic]: + results = [] + for mapping in 
url_mappings: + sync_info = self._mapper.get(mapping.url) + for agency_id in sync_info.agency_ids: + results.append( + LinkURLAgencyPydantic( + url_id=mapping.url_id, + agency_id=agency_id + ) + ) + + return results + + def add_new_data_sources( + self, + mappings: list[URLMapping] + ) -> list[URLDataSourcePydantic]: + results = [] + for mapping in mappings: + sync_info = self._mapper.get(mapping.url) + results.append( + URLDataSourcePydantic( + data_source_id=sync_info.id, + url_id=mapping.url_id + ) + ) + return results + + diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py new file mode 100644 index 00000000..14a73ce8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py @@ -0,0 +1,78 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import \ + UpdateLinkURLAgencyForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.query import \ + URLAgencyLinkUpdateQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.insert.params import \ + InsertURLForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.query import \ + LookupURLForDataSourcesSyncQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.update.params import \ + UpdateURLForDataSourcesSyncParams +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.instantiations.url.data_source.pydantic import URLDataSourcePydantic + + +class UpsertURLsFromDataSourcesDBRequester: + + def __init__(self, session: AsyncSession): + self.session = session + + + async def add_new_urls( + self, + params: list[InsertURLForDataSourcesSyncParams] + ): + url_ids = await sh.bulk_insert( + session=self.session, + models=params, + return_ids=True + ) + results = [] + for insert_param, url_id in zip(params, url_ids): + results.append( + URLMapping( + url=insert_param.url, + url_id=url_id, + ) + ) + return results + + async def lookup_urls( + self, + urls: list[str], + ) -> list[LookupURLForDataSourcesSyncResponse]: + """Lookup URLs for data source sync-relevant information.""" + builder = LookupURLForDataSourcesSyncQueryBuilder(urls=urls) + return await builder.run(session=self.session) + + async def update_existing_urls( + self, + params: list[UpdateURLForDataSourcesSyncParams], + ) -> None: + await sh.bulk_update(session=self.session, models=params) + + async def add_new_data_sources( + self, + params: list[URLDataSourcePydantic] + ) -> None: + await sh.bulk_insert(session=self.session, models=params) + + async def add_new_agency_links( + self, + params: list[LinkURLAgencyPydantic] + ): + await sh.bulk_insert(session=self.session, models=params) + + async def update_agency_links( + self, + params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] + ) -> None: + """Overwrite existing url_agency links with new ones, if applicable.""" + query = URLAgencyLinkUpdateQueryBuilder(params) + await query.run(self.session) \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py new file mode 100644 index 00000000..1cab6e0d --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py @@ -0,0 +1,16 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class InsertURLForDataSourcesSyncParams(BulkInsertableModel): + url: str + name: str + description: str | None + outcome: URLStatus + record_type: RecordType + + @classmethod + def sa_model(cls) -> type[URL]: + return URL \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py new file mode 100644 index 00000000..027cf3c3 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py @@ -0,0 +1,7 @@ + + + +def format_agency_ids_result(agency_ids: list[int | None]) -> list[int]: + if agency_ids == [None]: + return [] + return agency_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py new file mode 100644 index 00000000..f24c84ae --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py @@ -0,0 +1,62 @@ +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.format import format_agency_ids_result +from src.db.helpers.session import session_helper as sh +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse, URLDataSyncInfo +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupURLForDataSourcesSyncQueryBuilder(QueryBuilderBase): + """Look up provided URLs for corresponding database entries.""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[LookupURLForDataSourcesSyncResponse]: + url_id_label = "url_id" + data_source_id_label = "data_source_id" + agency_ids_label = "agency_ids" + + query = ( + select( + URL.url, + URL.id.label(url_id_label), + URLDataSource.data_source_id.label(data_source_id_label), + 
func.json_agg(LinkURLAgency.agency_id).label(agency_ids_label) + ).select_from(URL) + .outerjoin(URLDataSource) + .outerjoin(LinkURLAgency) + .where( + URL.url.in_( + self.urls + ) + ) + .group_by( + URL.url, + URL.id, + URLDataSource.data_source_id + ) + ) + + db_results = await sh.mappings(session=session, query=query) + + final_results = [] + for db_result in db_results: + final_results.append( + LookupURLForDataSourcesSyncResponse( + data_source_id=db_result[data_source_id_label], + url_info=URLDataSyncInfo( + url=db_result["url"], + url_id=db_result[url_id_label], + agency_ids=format_agency_ids_result(db_result[agency_ids_label]) + ) + ) + ) + + return final_results diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py new file mode 100644 index 00000000..845a6589 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +class URLDataSyncInfo(BaseModel): + url: str + url_id: int + agency_ids: list[int] + +class LookupURLForDataSourcesSyncResponse(BaseModel): + data_source_id: int | None + url_info: URLDataSyncInfo | None diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py new file mode 100644 index 00000000..fb8a9d64 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py @@ -0,0 +1,21 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.templates.markers.bulk.update import BulkUpdatableModel + + +class UpdateURLForDataSourcesSyncParams(BulkUpdatableModel): + + @classmethod + def id_field(cls) -> str: + return "id" + + @classmethod + def sa_model(cls) -> type[URL]: + return URL + + id: int + name: str + description: str | None + outcome: URLStatus + record_type: RecordType diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py deleted file mode 100644 index c70bcbec..00000000 --- a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/core.py +++ /dev/null @@ -1,117 +0,0 @@ -from typing import final - -from sqlalchemy.ext.asyncio import AsyncSession -import src.db.session_helper as sh -from typing_extensions import override - -from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.pydantic.upsert import URLUpsertModel -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus - -@final -class UpsertURLsFromDataSourcesQueryBuilder(QueryBuilderBase): - - def __init__(self, data_sources: list[DataSourcesSyncResponseInnerInfo]): - super().__init__() - self.data_sources = data_sources - - @override - async def run(self, session: AsyncSession) -> None: - await self.upsert_urls(session=session) - await self.update_agency_links() - await self.update_url_data_sources() - - async def upsert_urls(self, 
session: AsyncSession): - results = [] - for data_source in self.data_sources: - results.append( - URLUpsertModel( - id=data_source.id, - name=data_source.name, - description=data_source.description, - outcome=_convert_to_source_collector_url_status( - ds_url_status=data_source.url_status, - ds_approval_status=data_source.approval_status - ), - record_type=data_source.record_type - ) - ) - await sh.bulk_upsert(session=session, models=results) - - async def update_agency_links(self) -> None: - """Overwrite existing url_agency links with new ones, if applicable.""" - for data_source in self.data_sources: - - # Get existing links - pass - # Get new links - pass - # Remove all links not in new links - pass - # Add new links - pass - - - async def update_url_data_sources(self) -> None: - # Get existing url-data sources attributes - pass - - # Get new url-data sources attributes - pass - - # Overwrite all existing url-data sources attributes that are not in new - pass - - # Add new url-data sources attributes - pass - - raise NotImplementedError - - -def convert_data_sources_sync_response_to_url_upsert( - data_sources: list[DataSourcesSyncResponseInnerInfo] -) -> list[URLUpsertModel]: - results = [] - for data_source in data_sources: - results.append( - URLUpsertModel( - id=data_source.id, - name=data_source.name, - description=data_source.description, - outcome=_convert_to_source_collector_url_status( - ds_url_status=data_source.url_status, - ds_approval_status=data_source.approval_status - ), - record_type=data_source.record_type - ) - ) - return results - - -def _convert_to_source_collector_url_status( - ds_url_status: DataSourcesURLStatus, - ds_approval_status: ApprovalStatus -) -> URLStatus: - match ds_url_status: - case DataSourcesURLStatus.AVAILABLE: - raise NotImplementedError("Logic not implemented for this status.") - case DataSourcesURLStatus.NONE_FOUND: - raise NotImplementedError("Logic not implemented for this status.") - case DataSourcesURLStatus.BROKEN: - return URLStatus.NOT_FOUND - case _: - pass - - match ds_approval_status: - case ApprovalStatus.APPROVED: - return URLStatus.VALIDATED - case ApprovalStatus.REJECTED: - return URLStatus.NOT_RELEVANT - case ApprovalStatus.NEEDS_IDENTIFICATION: - return URLStatus.PENDING - case ApprovalStatus.PENDING: - return URLStatus.PENDING - case _: - raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py deleted file mode 100644 index 84dda14d..00000000 --- a/src/core/tasks/scheduled/sync/data_sources/queries/upsert_/url_agency_link.py +++ /dev/null @@ -1,9 +0,0 @@ -from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyUpsertModel -from src.db.queries.base.builder import QueryBuilderBase - - -class URLAgencyLinkUpsertQueryBuilder(QueryBuilderBase): - - def __init__(self, models: list[LinkURLAgencyUpsertModel]): - super().__init__() - self.models = models \ No newline at end of file diff --git a/src/core/tasks/url/operators/url_html/core.py b/src/core/tasks/url/operators/url_html/core.py index 091a1c10..39a09546 100644 --- a/src/core/tasks/url/operators/url_html/core.py +++ b/src/core/tasks/url/operators/url_html/core.py @@ -2,7 +2,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from 
src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.core.tasks.url.operators.url_html.tdo import UrlHtmlTDO diff --git a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py index 70d2f6a3..ff7f7c10 100644 --- a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py +++ b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/url_html/tdo.py b/src/core/tasks/url/operators/url_html/tdo.py index f40c9bc2..326412a3 100644 --- a/src/core/tasks/url/operators/url_html/tdo.py +++ b/src/core/tasks/url/operators/url_html/tdo.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo diff --git a/src/db/client/async_.py b/src/db/client/async_.py index fe481742..fe4a498e 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -58,12 +58,13 @@ from src.core.tasks.scheduled.sync.agency.queries.update_sync_progress import get_update_agencies_sync_progress_query from src.core.tasks.scheduled.sync.agency.queries.upsert import \ convert_agencies_sync_response_to_agencies_upsert -from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.scheduled.sync.data_sources.queries.get_sync_params import GetDataSourcesSyncParametersQueryBuilder from src.core.tasks.scheduled.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query from src.core.tasks.scheduled.sync.data_sources.queries.update_sync_progress import \ get_update_data_sources_sync_progress_query -from src.core.tasks.scheduled.sync.data_sources.queries.upsert_.core import convert_data_sources_sync_response_to_url_upsert +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.core import \ + UpsertURLsFromDataSourcesQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \ @@ -80,7 +81,7 @@ from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ HasPendingURsMissingMiscellaneousDataQueryBuilder from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO -from src.db import session_helper as sh 
+from src.db.helpers.session import session_helper as sh from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager @@ -95,12 +96,12 @@ from src.db.models.instantiations.backlog_snapshot import BacklogSnapshot from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.link.task_url import LinkTaskURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.log.pydantic.info import LogInfo from src.db.models.instantiations.log.pydantic.output import LogOutputInfo from src.db.models.instantiations.log.sqlalchemy import Log @@ -109,9 +110,9 @@ from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source import URLDataSource +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.html_content import URLHTMLContent @@ -131,7 +132,8 @@ from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ GetMetricsURLSAggregatedPendingQueryBuilder from src.db.statement_composer import StatementComposer -from src.db.templates.upsert import UpsertModel +from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo @@ -213,10 +215,18 @@ async def bulk_update( async def bulk_upsert( self, session: AsyncSession, - models: list[UpsertModel], + models: list[BulkUpsertableModel], ): return await sh.bulk_upsert(session, models) + @session_manager + async def bulk_delete( + self, + session: AsyncSession, + models: list[BulkDeletableModel], + ): + return await sh.bulk_delete(session, models) + @session_manager async def scalar(self, session: AsyncSession, statement): """Fetch the first column of the first row.""" @@ -1582,8 +1592,10 @@ async def upsert_urls_from_data_sources( self, data_sources: list[DataSourcesSyncResponseInnerInfo] ): - await self.bulk_upsert( - models=convert_data_sources_sync_response_to_url_upsert(data_sources) + await self.run_query_builder( + UpsertURLsFromDataSourcesQueryBuilder( + sync_infos=data_sources + ) ) async def update_agencies_sync_progress(self, page: int): diff 
--git a/src/db/client/sync.py b/src/db/client/sync.py index 7d435118..361cb25a 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -11,13 +11,13 @@ from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.instantiations.log.pydantic.info import LogInfo -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.templates import Base from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.log.sqlalchemy import Log -from src.db.models.instantiations.url.data_source import URLDataSource +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 4afa641e..ed2d361c 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -9,7 +9,7 @@ from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo from src.db.dtos.url.with_html import URLWithHTML -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/src/db/helpers.py b/src/db/helpers.py index 618b2e6d..10151935 100644 --- a/src/db/helpers.py +++ b/src/db/helpers.py @@ -1,5 +1,3 @@ from src.core.env_var_manager import EnvVarManager -def get_postgres_connection_string(is_async = False): - return EnvVarManager.get().get_postgres_connection_string(is_async) diff --git a/src/db/helpers/__init__.py b/src/db/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/helpers/connect.py b/src/db/helpers/connect.py new file mode 100644 index 00000000..618b2e6d --- /dev/null +++ b/src/db/helpers/connect.py @@ -0,0 +1,5 @@ +from src.core.env_var_manager import EnvVarManager + + +def get_postgres_connection_string(is_async = False): + return EnvVarManager.get().get_postgres_connection_string(is_async) diff --git a/src/db/helpers/session/__init__.py b/src/db/helpers/session/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/helpers/session/parser.py b/src/db/helpers/session/parser.py new file mode 100644 index 00000000..bc822022 --- /dev/null +++ b/src/db/helpers/session/parser.py @@ -0,0 +1,41 @@ +from src.db.helpers.session.types import BulkActionType +from src.db.models.templates import Base +from src.db.templates.protocols.sa_correlated.core import SQLAlchemyCorrelatedProtocol +from src.db.templates.protocols.sa_correlated.with_id import SQLAlchemyCorrelatedWithIDProtocol +from src.db.utils.validate import validate_all_models_of_same_type + + +class BulkActionParser: + + def __init__( + self, + models: 
list[BulkActionType],
+    ):
+        validate_all_models_of_same_type(models)
+        model_class = type(models[0])
+        self.models = models
+        self.model_class = model_class
+
+    @property
+    def id_field(self) -> str:
+        if not issubclass(self.model_class, SQLAlchemyCorrelatedWithIDProtocol):
+            raise TypeError("Model must implement SQLAlchemyCorrelatedWithID protocol.")
+
+        return self.model_class.id_field()
+
+    @property
+    def sa_model(self) -> type[Base]:
+        if not issubclass(self.model_class, SQLAlchemyCorrelatedProtocol):
+            raise TypeError(f"Model {self.model_class} must implement SQLAlchemyCorrelated protocol.")
+        return self.model_class.sa_model()
+
+    def get_non_id_fields(self) -> list[str]:
+        return [
+            field for field in self.model_class.model_fields.keys()
+            if field != self.id_field
+        ]
+
+    def get_all_fields(self) -> list[str]:
+        return [
+            field for field in self.model_class.model_fields.keys()
+        ]
diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py
new file mode 100644
index 00000000..2b3776c1
--- /dev/null
+++ b/src/db/helpers/session/session_helper.py
@@ -0,0 +1,214 @@
+"""
+session_helper (aliased as sh) contains a number of convenience
+functions for working with a SQLAlchemy session
+"""
+from typing import Any, Optional, Sequence
+
+import sqlalchemy as sa
+from sqlalchemy import update, ColumnElement, Row
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.db.helpers.session.parser import BulkActionParser
+from src.db.models.templates import Base, StandardBase
+from src.db.templates.markers.bulk.delete import BulkDeletableModel
+from src.db.templates.markers.bulk.insert import BulkInsertableModel
+from src.db.templates.markers.bulk.update import BulkUpdatableModel
+from src.db.templates.markers.bulk.upsert import BulkUpsertableModel
+from src.db.templates.protocols.has_id import HasIDProtocol
+
+
+async def one_or_none(
+    session: AsyncSession,
+    query: sa.Select
+) -> sa.Row | None:
+    raw_result = await session.execute(query)
+    return raw_result.scalars().one_or_none()
+
+async def scalar(session: AsyncSession, query: sa.Select) -> Any:
+    """Fetch the first column of the first row."""
+    raw_result = await session.execute(query)
+    return raw_result.scalar()
+
+async def scalars(session: AsyncSession, query: sa.Select) -> Any:
+    raw_result = await session.execute(query)
+    return raw_result.scalars().all()
+
+async def mapping(session: AsyncSession, query: sa.Select) -> sa.RowMapping:
+    raw_result = await session.execute(query)
+    return raw_result.mappings().one()
+
+async def mappings(session: AsyncSession, query: sa.Select) -> Sequence[sa.RowMapping]:
+    raw_result = await session.execute(query)
+    return raw_result.mappings().all()
+
+async def bulk_upsert(
+    session: AsyncSession,
+    models: list[BulkUpsertableModel],
+):
+    if len(models) == 0:
+        return
+    parser = BulkActionParser(models)
+
+    query = pg_insert(parser.sa_model)
+
+    upsert_mappings = [upsert_model.model_dump() for upsert_model in models]
+
+    set_ = {}
+    for k, v in upsert_mappings[0].items():
+        if k == parser.id_field:
+            continue
+        set_[k] = getattr(query.excluded, k)
+
+    query = query.on_conflict_do_update(
+        index_elements=[parser.id_field],
+        set_=set_
+    )
+
+    # Note, mapping must include primary key
+    await session.execute(
+        statement=query,
+        params=upsert_mappings
+    )
+
+async def add(
+    session: AsyncSession,
+    model: Base,
+    return_id: bool = False
+) -> 
int | None: + session.add(model) + if return_id: + if not isinstance(model, HasIDProtocol): + raise AttributeError("Models must have an id attribute") + await session.flush() + return model.id + return None + + +async def add_all( + session: AsyncSession, + models: list[StandardBase], + return_ids: bool = False +) -> list[int] | None: + session.add_all(models) + if return_ids: + if not isinstance(models[0], HasIDProtocol): + raise AttributeError("Models must have an id attribute") + await session.flush() + return [ + model.id # pyright: ignore [reportAttributeAccessIssue] + for model in models + ] + return None + +async def get_all( + session: AsyncSession, + model: Base, + order_by_attribute: Optional[str] = None +) -> Sequence[Row]: + """ + Get all records of a model + Used primarily in testing + """ + statement = sa.select(model) + if order_by_attribute: + statement = statement.order_by(getattr(model, order_by_attribute)) + result = await session.execute(statement) + return result.scalars().all() + +def compile_to_sql(statement) -> str: + compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) + return compiled_sql + + +async def bulk_delete(session: AsyncSession, models: list[BulkDeletableModel]): + """Bulk delete sqlalchemy models of the same type.""" + if len(models) == 0: + return + + parser = BulkActionParser(models) + + # Use declared field names from the model (excludes properties/methods) + field_names = parser.get_all_fields() + + sa_model = parser.sa_model + + # Get value tuples to be used in identifying attributes for bulk delete + value_tuples = [] + for model in models: + tup = tuple(getattr(model, field) for field in field_names) + value_tuples.append(tup) + + + statement = ( + sa.delete( + sa_model + ).where( + sa.tuple_( + *[ + getattr(sa_model, attr) + for attr in field_names + ] + ).in_(value_tuples) + ) + ) + + await session.execute(statement) + +async def bulk_insert( + session: AsyncSession, + models: list[BulkInsertableModel], + return_ids: bool = False +) -> list[int] | None: + """Bulk insert sqlalchemy models via their pydantic counterparts.""" + + if len(models) == 0: + return None + + parser = BulkActionParser(models) + sa_model = parser.sa_model + + models_to_add = [] + for model in models: + sa_model_instance = sa_model(**model.model_dump()) + models_to_add.append(sa_model_instance) + + return await add_all( + session=session, + models=models_to_add, + return_ids=return_ids + ) + +async def bulk_update( + session: AsyncSession, + models: list[BulkUpdatableModel], +): + """Bulk update sqlalchemy models via their pydantic counterparts.""" + if len(models) == 0: + return + + parser = BulkActionParser(models) + + sa_model = parser.sa_model + id_field = parser.id_field + update_fields = parser.get_non_id_fields() + + + for model in models: + update_values = { + k: getattr(model, k) + for k in update_fields + } + id_value = getattr(model, id_field) + id_attr: ColumnElement = getattr(sa_model, id_field) + stmt = ( + update(sa_model) + .where( + id_attr == id_value + ) + .values(**update_values) + ) + await session.execute(stmt) + + diff --git a/src/db/helpers/session/types.py b/src/db/helpers/session/types.py new file mode 100644 index 00000000..b960b76c --- /dev/null +++ b/src/db/helpers/session/types.py @@ -0,0 +1,8 @@ +from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel +from src.db.templates.markers.bulk.update import 
BulkUpdatableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + +BulkActionType = ( + BulkInsertableModel | BulkUpdatableModel | BulkDeletableModel | BulkUpsertableModel +) diff --git a/src/db/models/instantiations/agency/pydantic/upsert.py b/src/db/models/instantiations/agency/pydantic/upsert.py index 4666a878..9a869e84 100644 --- a/src/db/models/instantiations/agency/pydantic/upsert.py +++ b/src/db/models/instantiations/agency/pydantic/upsert.py @@ -2,17 +2,17 @@ from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.templates import Base -from src.db.templates.upsert import UpsertModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel -class AgencyUpsertModel(UpsertModel): +class AgencyUpsertModel(BulkUpsertableModel): - @property - def id_field(self) -> str: + @classmethod + def id_field(cls) -> str: return "agency_id" - @property - def sa_model(self) -> type[Base]: + @classmethod + def sa_model(cls) -> type[Base]: return Agency agency_id: int diff --git a/src/db/models/instantiations/link/url_agency/pydantic.py b/src/db/models/instantiations/link/url_agency/pydantic.py index f76aa30a..75c02119 100644 --- a/src/db/models/instantiations/link/url_agency/pydantic.py +++ b/src/db/models/instantiations/link/url_agency/pydantic.py @@ -1,6 +1,15 @@ -from pydantic import BaseModel +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel -class LinkURLAgencyUpsertModel(BaseModel): +class LinkURLAgencyPydantic( + BulkDeletableModel, + BulkInsertableModel +): url_id: int - agency_ids: list[int] \ No newline at end of file + agency_id: int + + @classmethod + def sa_model(cls) -> type[LinkURLAgency]: + return LinkURLAgency \ No newline at end of file diff --git a/src/db/models/instantiations/link/url_agency/sqlalchemy.py b/src/db/models/instantiations/link/url_agency/sqlalchemy.py index 4bda5eaa..28e42924 100644 --- a/src/db/models/instantiations/link/url_agency/sqlalchemy.py +++ b/src/db/models/instantiations/link/url_agency/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy import UniqueConstraint, Column +from sqlalchemy import UniqueConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column diff --git a/src/db/models/instantiations/url/core/pydantic/info.py b/src/db/models/instantiations/url/core/pydantic.py similarity index 100% rename from src/db/models/instantiations/url/core/pydantic/info.py rename to src/db/models/instantiations/url/core/pydantic.py diff --git a/src/db/models/instantiations/url/core/pydantic/upsert.py b/src/db/models/instantiations/url/core/pydantic/upsert.py deleted file mode 100644 index 3492b271..00000000 --- a/src/db/models/instantiations/url/core/pydantic/upsert.py +++ /dev/null @@ -1,23 +0,0 @@ -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.db.models.templates import Base -from src.db.templates.upsert import UpsertModel -from src.db.models.instantiations.url.core.sqlalchemy import URL - - -class URLUpsertModel(UpsertModel): - - @property - def id_field(self) -> str: - return "id" - - @property - def sa_model(self) -> type[Base]: - return URL - - id: int - name: str - description: str - collector_metadata: dict | None = None - outcome: URLStatus - record_type: RecordType \ No newline at end of file diff --git 
a/src/db/models/instantiations/url/data_source/__init__.py b/src/db/models/instantiations/url/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/data_source/pydantic.py b/src/db/models/instantiations/url/data_source/pydantic.py new file mode 100644 index 00000000..00da8c5e --- /dev/null +++ b/src/db/models/instantiations/url/data_source/pydantic.py @@ -0,0 +1,11 @@ +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLDataSourcePydantic(BulkInsertableModel): + data_source_id: int + url_id: int + + @classmethod + def sa_model(cls) -> type[URLDataSource]: + return URLDataSource \ No newline at end of file diff --git a/src/db/models/instantiations/url/data_source.py b/src/db/models/instantiations/url/data_source/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/data_source.py rename to src/db/models/instantiations/url/data_source/sqlalchemy.py diff --git a/src/db/queries/base/builder.py b/src/db/queries/base/builder.py index 1295fbd1..4b5fd118 100644 --- a/src/db/queries/base/builder.py +++ b/src/db/queries/base/builder.py @@ -3,7 +3,7 @@ from sqlalchemy import FromClause, ColumnClause from sqlalchemy.ext.asyncio import AsyncSession -from src.db import session_helper as sh +from src.db.helpers.session import session_helper as sh from src.db.types import LabelsType diff --git a/src/db/session_helper.py b/src/db/session_helper.py deleted file mode 100644 index f86d968d..00000000 --- a/src/db/session_helper.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -session_helper (aliased as sh) contains a number of convenience -functions for workings with a SQLAlchemy session -""" -from typing import Any, Optional - -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.dialects.postgresql import insert as pg_insert - -from src.db.models.templates import Base -from src.db.templates.upsert import UpsertModel - - -async def scalar(session: AsyncSession, query: sa.Select) -> Any: - """Fetch the first column of the first row.""" - raw_result = await session.execute(query) - return raw_result.scalar() - -async def scalars(session: AsyncSession, query: sa.Select) -> Any: - raw_result = await session.execute(query) - return raw_result.scalars().all() - -async def mapping(session: AsyncSession, query: sa.Select) -> sa.RowMapping: - raw_result = await session.execute(query) - return raw_result.mappings().one() - - -async def bulk_upsert( - session: AsyncSession, - models: list[UpsertModel], -): - if len(models) == 0: - return - - first_model = models[0] - - query = pg_insert(first_model.sa_model) - - mappings = [upsert_model.model_dump() for upsert_model in models] - - set_ = {} - for k, v in mappings[0].items(): - if k == first_model.id_field: - continue - set_[k] = getattr(query.excluded, k) - - query = query.on_conflict_do_update( - index_elements=[first_model.id_field], - set_=set_ - ) - - # Note, mapping must include primary key - await session.execute( - query, - mappings - ) - -async def add( - session: AsyncSession, - model: Base, - return_id: bool = False -) -> int | None: - session.add(model) - if return_id: - if not hasattr(model, "id"): - raise AttributeError("Models must have an id attribute") - await session.flush() - return model.id - return None - - -async def add_all( - session: AsyncSession, - models: list[Base], - 
return_ids: bool = False -) -> list[int] | None: - session.add_all(models) - if return_ids: - if not hasattr(models[0], "id"): - raise AttributeError("Models must have an id attribute") - await session.flush() - return [ - model.id # pyright: ignore [reportAttributeAccessIssue] - for model in models - ] - return None - -async def get_all( - session: AsyncSession, - model: Base, - order_by_attribute: Optional[str] = None -) -> list[Base]: - """ - Get all records of a model - Used primarily in testing - """ - statement = sa.select(model) - if order_by_attribute: - statement = statement.order_by(getattr(model, order_by_attribute)) - result = await session.execute(statement) - return result.scalars().all() - -def compile_to_sql(statement) -> str: - compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) - return compiled_sql \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index dfac8c9c..518aafc2 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -7,9 +7,9 @@ from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.link.task_url import LinkTaskURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata diff --git a/src/db/templates/markers/__init__.py b/src/db/templates/markers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/markers/bulk/__init__.py b/src/db/templates/markers/bulk/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/markers/bulk/delete.py b/src/db/templates/markers/bulk/delete.py new file mode 100644 index 00000000..9da0c980 --- /dev/null +++ b/src/db/templates/markers/bulk/delete.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class BulkDeletableModel(BaseModel): + """Identifies a model that can be used for the bulk_delete function in session_helper.""" + diff --git a/src/db/templates/markers/bulk/insert.py b/src/db/templates/markers/bulk/insert.py new file mode 100644 index 00000000..d147e44f --- /dev/null +++ b/src/db/templates/markers/bulk/insert.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class BulkInsertableModel(BaseModel): + """Identifies a model that can be used for the bulk_insert function in session_helper.""" diff --git a/src/db/templates/markers/bulk/update.py b/src/db/templates/markers/bulk/update.py new file mode 100644 index 00000000..d0476135 --- /dev/null +++ b/src/db/templates/markers/bulk/update.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class BulkUpdatableModel(BaseModel): + """Identifies a model that can be used for the bulk_update function in session_helper.""" diff --git a/src/db/templates/markers/bulk/upsert.py b/src/db/templates/markers/bulk/upsert.py new file mode 100644 index 00000000..86d683bb --- /dev/null +++ b/src/db/templates/markers/bulk/upsert.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class BulkUpsertableModel(BaseModel): + """Identifies a model that can be used for the bulk_upsert function in 
session_helper.""" \ No newline at end of file diff --git a/src/db/templates/protocols/__init__.py b/src/db/templates/protocols/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/protocols/has_id.py b/src/db/templates/protocols/has_id.py new file mode 100644 index 00000000..fc3519a2 --- /dev/null +++ b/src/db/templates/protocols/has_id.py @@ -0,0 +1,6 @@ +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class HasIDProtocol(Protocol): + id: int \ No newline at end of file diff --git a/src/db/templates/protocols/sa_correlated/__init__.py b/src/db/templates/protocols/sa_correlated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/protocols/sa_correlated/core.py b/src/db/templates/protocols/sa_correlated/core.py new file mode 100644 index 00000000..6b77c835 --- /dev/null +++ b/src/db/templates/protocols/sa_correlated/core.py @@ -0,0 +1,15 @@ +from abc import abstractmethod +from typing import Protocol, runtime_checkable + +from src.db.models.templates import Base + + +@runtime_checkable +class SQLAlchemyCorrelatedProtocol(Protocol): + + + @classmethod + @abstractmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + pass diff --git a/src/db/templates/protocols/sa_correlated/with_id.py b/src/db/templates/protocols/sa_correlated/with_id.py new file mode 100644 index 00000000..4e3609e1 --- /dev/null +++ b/src/db/templates/protocols/sa_correlated/with_id.py @@ -0,0 +1,20 @@ +from abc import abstractmethod +from typing import Protocol, runtime_checkable + +from src.db.models.templates import Base + + +@runtime_checkable +class SQLAlchemyCorrelatedWithIDProtocol(Protocol): + + @classmethod + @abstractmethod + def id_field(cls) -> str: + """Defines the field to be used as the primary key.""" + return "id" + + @classmethod + @abstractmethod + def sa_model(cls) -> type[Base]: + """Defines the correlated SQLAlchemy model.""" + pass diff --git a/src/db/templates/upsert.py b/src/db/templates/upsert.py deleted file mode 100644 index d80de944..00000000 --- a/src/db/templates/upsert.py +++ /dev/null @@ -1,20 +0,0 @@ -from abc import ABC, abstractmethod - -from pydantic import BaseModel - -from src.db.models.templates import Base - - -class UpsertModel(BaseModel, ABC): - """An abstract base class for encapsulating upsert operations.""" - - @property - def id_field(self) -> str: - """Defines the field to be used as the primary key.""" - return "id" - - @property - @abstractmethod - def sa_model(self) -> type[Base]: - """Defines the SQLAlchemy model to be upserted.""" - pass \ No newline at end of file diff --git a/src/db/utils/validate.py b/src/db/utils/validate.py new file mode 100644 index 00000000..077b7752 --- /dev/null +++ b/src/db/utils/validate.py @@ -0,0 +1,13 @@ +from typing import Protocol + +from pydantic import BaseModel + + +def validate_has_protocol(obj: object, protocol: type[Protocol]): + if not isinstance(obj, protocol): + raise TypeError(f"Class must implement {protocol} protocol.") + +def validate_all_models_of_same_type(objects: list[object]): + first_model = objects[0] + if not all(isinstance(model, type(first_model)) for model in objects): + raise TypeError("Models must be of the same type") \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index d0fe5464..a68179fe 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -3,7 +3,7 @@ from pdap_access_manager import AccessManager, 
DataSourcesNamespaces, RequestInfo, RequestType
 from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters
-from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters
+from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters
 from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo
 from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo
 from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo
diff --git a/src/external/pdap/dtos/sync/data_sources.py b/src/external/pdap/dtos/sync/data_sources.py
index b7e275e9..a5fe92b9 100644
--- a/src/external/pdap/dtos/sync/data_sources.py
+++ b/src/external/pdap/dtos/sync/data_sources.py
@@ -10,7 +10,7 @@ class DataSourcesSyncResponseInnerInfo(BaseModel):
     id: int
     url: str
     name: str
-    description: str
+    description: str | None
     record_type: RecordType
     agency_ids: list[int]
     approval_status: ApprovalStatus
diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py
index 4dcb3fdc..780484cc 100644
--- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py
+++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py
@@ -6,7 +6,7 @@
 from src.core.enums import RecordType
 from src.db.constants import PLACEHOLDER_AGENCY_NAME
 from src.db.models.instantiations.agency.sqlalchemy import Agency
-from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency
+from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency
 from src.db.models.instantiations.url.core.sqlalchemy import URL
 from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
 from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review
diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py
index 7af3807c..df783e84 100644
--- a/tests/automated/integration/db/client/approve_url/test_basic.py
+++ b/tests/automated/integration/db/client/approve_url/test_basic.py
@@ -3,7 +3,7 @@
 from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo
 from src.collectors.enums import URLStatus
 from src.core.enums import RecordType
-from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency
+from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency
 from src.db.models.instantiations.url.core.sqlalchemy import URL
 from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
 from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL
diff --git a/tests/automated/integration/db/client/test_delete_url_updated_at.py b/tests/automated/integration/db/client/test_delete_url_updated_at.py
index d923d770..34bbc7b3 100644
--- a/tests/automated/integration/db/client/test_delete_url_updated_at.py
+++ b/tests/automated/integration/db/client/test_delete_url_updated_at.py
@@ -1,4 +1,4 @@
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo
 from tests.helpers.db_data_creator import DBDataCreator


diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py
index 2f304219..a9aaf1fe 100644
--- a/tests/automated/integration/db/client/test_insert_urls.py
+++ b/tests/automated/integration/db/client/test_insert_urls.py
@@ -2,7 +2,7 @@
 from src.core.enums import BatchStatus
 from src.db.models.instantiations.batch.pydantic import BatchInfo
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo


 @pytest.mark.asyncio
diff --git a/tests/automated/integration/db/structure/test_batch.py b/tests/automated/integration/db/structure/test_batch.py
index 7f7bfcf3..f905b178 100644
--- a/tests/automated/integration/db/structure/test_batch.py
+++ b/tests/automated/integration/db/structure/test_batch.py
@@ -4,7 +4,7 @@
 from src.collectors.enums import CollectorType
 from src.core.enums import BatchStatus
-from src.db.helpers import get_postgres_connection_string
+from src.db.helpers.connect import get_postgres_connection_string
 from src.util.helper_functions import get_enum_values
 from tests.automated.integration.db.structure.testers.models.column import ColumnTester
 from tests.automated.integration.db.structure.testers.table import TableTester
diff --git a/tests/automated/integration/db/structure/testers/table.py b/tests/automated/integration/db/structure/testers/table.py
index ca594eb4..aed5d3a5 100644
--- a/tests/automated/integration/db/structure/testers/table.py
+++ b/tests/automated/integration/db/structure/testers/table.py
@@ -6,7 +6,7 @@
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.exc import DataError
-from src.db.helpers import get_postgres_connection_string
+from src.db.helpers.connect import get_postgres_connection_string
 from src.db.models.templates import Base
 from tests.automated.integration.db.structure.testers.models.column import ColumnTester
 from tests.automated.integration.db.structure.types import ConstraintTester, SATypes
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py
index 22d5424d..d034def8 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py
@@ -2,7 +2,7 @@
 from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency
 from src.db.models.instantiations.url.core.sqlalchemy import URL
-from src.db.models.instantiations.url.data_source import URLDataSource
+from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource
 from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py
index ddc7b9d6..787a60f0 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py
@@ -21,7 +21,7 @@
             sync_response_order=SyncResponseOrder.FIRST
         ),
         sc_info=TestSCURLSetupEntry(
-            name='Pre-existing URL 1',
+            name='Pre-existing URL 1 Name',
            description='Pre-existing URL 1 Description',
            record_type=RecordType.ACCIDENT_REPORTS,
            url_status=URLStatus.PENDING,
@@ -64,7 +64,7 @@
         ds_info=TestDSURLSetupEntry(
             id=102,
             name='New URL 4 Name',
-            description='New URL 4 Description',
+            description=None,
             url_status=DataSourcesURLStatus.OK,
             approval_status=ApprovalStatus.REJECTED,
             record_type=RecordType.ACCIDENT_REPORTS,
@@ -80,7 +80,7 @@
         ds_info=TestDSURLSetupEntry(
             id=103,
             name='New URL 5 Name',
-            description='New URL 5 Description',
+            description=None,
             url_status=DataSourcesURLStatus.OK,
             approval_status=ApprovalStatus.APPROVED,
             record_type=RecordType.INCARCERATION_RECORDS,
@@ -95,7 +95,6 @@
             agencies_assigned=[]
         ),
         final_url_status=URLStatus.VALIDATED
-
     )
 ]
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py
index 0720edfa..79f44f88 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py
@@ -24,6 +24,9 @@ def __init__(
         self.url_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {}
         self.ds_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {}
+        self.sync_response_order_to_setup_record: dict[
+            SyncResponseOrder, list[TestURLPostSetupRecord]
+        ] = defaultdict(list)
         self.response_dict: dict[
             SyncResponseOrder, list[DataSourcesSyncResponseInnerInfo]
@@ -60,13 +63,17 @@ async def setup_entry(
         self.url_id_to_setup_record[result.url_id] = result
         if result.data_sources_id is not None:
             self.ds_id_to_setup_record[result.data_sources_id] = result
+        if entry.ds_info is not None:
+            self.sync_response_order_to_setup_record[
+                entry.ds_info.sync_response_order
+            ].append(result)

     async def setup_agencies(self):
         await self.agency_assignment_manager.setup()

     async def get_data_sources_sync_responses(
         self,
-        orders: list[SyncResponseOrder]
+        orders: list[SyncResponseOrder | ValueError]
     ) -> list[DataSourcesSyncResponseInfo]:
         results = []
         for order in orders:
@@ -93,4 +100,12 @@ async def check_results(self):
         for url_id in self.url_id_to_setup_record.keys():
             await self.check_via_url(url_id)
         for data_source_id in self.ds_id_to_setup_record.keys():
-            await self.check_via_data_source(data_source_id)
\ No newline at end of file
+            await self.check_via_data_source(data_source_id)
+
+    async def check_via_sync_response_order(self, order: SyncResponseOrder):
+        records = self.sync_response_order_to_setup_record[order]
+        for record in records:
+            builder = CheckURLQueryBuilder(
+                record=record
+            )
+            await self.adb_client.run_query_builder(builder)
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py
index 5cd8aeb4..c9055749 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/queries/check.py
@@ -2,12 +2,11 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import selectinload

-from src.db.models.instantiations.agency.sqlalchemy import Agency
-from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency
 from src.db.models.instantiations.url.core.sqlalchemy import URL
-from src.db.models.instantiations.url.data_source import URLDataSource
+from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource
 from src.db.queries.base.builder import QueryBuilderBase
 from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord
+from src.db.helpers.session import session_helper as sh


 class CheckURLQueryBuilder(QueryBuilderBase):
@@ -27,18 +26,15 @@ async def run(self, session: AsyncSession) -> None:
                 selectinload(URL.data_source),
                 selectinload(URL.confirmed_agencies),
             )
-            .join(URLDataSource, URL.id == URLDataSource.data_source_id)
-            .outerjoin(LinkURLAgency, URL.id == LinkURLAgency.url_id)
-            .join(Agency, LinkURLAgency.agency_id == Agency.agency_id)
+            .outerjoin(URLDataSource, URL.id == URLDataSource.url_id)
         )
         if self.record.url_id is not None:
             query = query.where(URL.id == self.record.url_id)
         if self.record.data_sources_id is not None:
-            query = query.where(URLDataSource.id == self.record.data_sources_id)
+            query = query.where(URLDataSource.data_source_id == self.record.data_sources_id)

-        raw_result = await session.execute(query)
-        result = raw_result.scalars().one_or_none()
-        assert result is not None
+        result = await sh.one_or_none(session=session, query=query)
+        assert result is not None, f"URL not found for {self.record}"
         await self.check_results(result)

     async def check_results(self, url: URL):
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py
index 8edbbf33..2c563f09 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py
@@ -1,7 +1,7 @@
 from pendulum import today

 from src.db.client.async_ import AsyncDatabaseClient
-from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency
+from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency
 from src.db.models.instantiations.url.core.sqlalchemy import URL
 from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo
 from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py
index cadcfb4a..5112dd1f 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/models/url/data_sources.py
@@ -12,7 +12,7 @@ class TestDSURLSetupEntry(BaseModel):
     """
     id: int  # ID of URL in DS App
     name: str
-    description: str
+    description: str | None
     url_status: DataSourcesURLStatus
     approval_status: ApprovalStatus
     record_type: RecordType
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py
index b0f98c3f..0b71b28c 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_happy_path.py
@@ -2,7 +2,7 @@
 import pytest

-from src.core.tasks.scheduled.sync.data_sources.dtos.parameters import DataSourcesSyncParameters
+from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters
 from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator
 from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded
 from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources
@@ -56,8 +56,6 @@ async def test_data_sources_sync_happy_path(
     )
     await check_sync_concluded(adb_client, check_updated_at=False)

-    # TODO: Fill in additional components
-
     # Check results according to expectations.
     await manager.check_results()

diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py
index e69de29b..955c33fb 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_interruption.py
@@ -0,0 +1,65 @@
+import pytest
+from sqlalchemy import select
+
+from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator
+from src.core.tasks.url.enums import TaskOperatorOutcome
+from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState
+from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import ENTRIES
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.core import \
+    DataSourcesSyncTestSetupManager
+
+
+
+@pytest.mark.asyncio
+async def test_data_sources_sync_interruption(
+    test_operator: SyncDataSourcesTaskOperator
+):
+    adb_client = test_operator.adb_client
+
+    manager = DataSourcesSyncTestSetupManager(
+        adb_client=adb_client,
+        entries=ENTRIES
+    )
+    await manager.setup()
+
+    first_response = await manager.get_data_sources_sync_responses(
+        [SyncResponseOrder.FIRST]
+    )
+
+    with patch_sync_data_sources(
+        side_effects=
+        first_response +
+        [ValueError("test error")]
+    ):
+        run_info = await test_operator.run_task(1)
+        assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message
+
+    await manager.check_via_sync_response_order(SyncResponseOrder.FIRST)
+
+    # Second response should not be processed
+    with pytest.raises(AssertionError):
+        await manager.check_via_sync_response_order(SyncResponseOrder.SECOND)
+
+    # Check sync state results
+    sync_state_results = await adb_client.scalar(
+        select(
+            DataSourcesSyncState
+        )
+    )
+    assert sync_state_results.current_page == 2
+    assert sync_state_results.last_full_sync_at is None
+    assert sync_state_results.current_cutoff_date is None
+
+    second_response = await manager.get_data_sources_sync_responses(
+        [SyncResponseOrder.SECOND, SyncResponseOrder.THIRD]
+    )
+    with patch_sync_data_sources(second_response):
+        await test_operator.run_task(2)
+
+    await check_sync_concluded(adb_client)
+
+    await manager.check_via_sync_response_order(SyncResponseOrder.SECOND)
+    await manager.check_via_sync_response_order(SyncResponseOrder.THIRD)
\ No newline at end of file
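The interruption test above leans on standard unittest.mock side-effect sequencing: when side_effect is an iterable, each call to the mock returns the next element, and any element that is an exception instance is raised instead of returned. A minimal, self-contained sketch of that behavior follows; it assumes patch_sync_data_sources (not shown in this diff) simply forwards its side_effects list into a patched sync_data_sources mock, so the mock name and payload strings here are illustrative only.

import asyncio
from unittest.mock import AsyncMock

# Hypothetical stand-in for the patched PDAP client call; patch_sync_data_sources
# is assumed to wire its side_effects list into a mock along these lines.
mock_sync = AsyncMock(side_effect=["page-1-payload", ValueError("test error")])

async def demo() -> None:
    first = await mock_sync()   # first element is returned as the call result
    assert first == "page-1-payload"
    try:
        await mock_sync()       # next element is an exception instance, so it is raised
    except ValueError as exc:
        print(f"sync aborted: {exc}")

asyncio.run(demo())

This is why the test can make page 1 succeed and page 2 fail with a single context manager: the mock consumes the responses in order and raises when it reaches the injected ValueError.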
diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py
index e69de29b..f32a12ec 100644
--- a/tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py
+++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/test_no_new_results.py
@@ -0,0 +1,59 @@
+from datetime import datetime
+from unittest.mock import MagicMock
+
+import pytest
+
+from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator
+from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters
+from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState
+from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import ENTRIES
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder
+from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.core import \
+    DataSourcesSyncTestSetupManager
+from tests.helpers.asserts import assert_task_run_success
+
+
+@pytest.mark.asyncio
+async def test_data_sources_sync_no_new_results(
+    test_operator: SyncDataSourcesTaskOperator
+):
+    adb_client = test_operator.adb_client
+
+    cutoff_date = datetime(2025, 5, 1).date()
+
+    manager = DataSourcesSyncTestSetupManager(
+        adb_client=adb_client,
+        entries=ENTRIES
+    )
+    await manager.setup()
+
+    first_response = await manager.get_data_sources_sync_responses(
+        [SyncResponseOrder.THIRD]
+    )
+
+    # Add cutoff date to database
+    await adb_client.add(
+        DataSourcesSyncState(
+            current_cutoff_date=cutoff_date
+        )
+    )
+
+    with patch_sync_data_sources(first_response):
+        run_info = await test_operator.run_task(1)
+        assert_task_run_success(run_info)
+        mock_func: MagicMock = test_operator.pdap_client.sync_data_sources
+
+    mock_func.assert_called_once_with(
+        DataSourcesSyncParameters(
+            cutoff_date=cutoff_date,
+            page=1
+        )
+    )
+    await check_sync_concluded(adb_client, check_updated_at=False)
+
+    # Check no syncs occurred
+    for sync_response_order in [SyncResponseOrder.FIRST, SyncResponseOrder.SECOND]:
+        with pytest.raises(AssertionError):
+            await manager.check_via_sync_response_order(sync_response_order)
diff --git a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py
index cfa2be99..4254c4ad 100644
--- a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py
+++ b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py
@@ -8,7 +8,7 @@
 from src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator
 from src.db.enums import TaskType
 from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo
-from src.db.models.instantiations.url.data_source import URLDataSource
+from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource
 from src.db.models.instantiations.url.core.sqlalchemy import URL
 from src.collectors.enums import URLStatus
 from src.core.tasks.url.enums import TaskOperatorOutcome
diff --git a/tests/automated/unit/db/__init__.py b/tests/automated/unit/db/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/automated/unit/db/utils/__init__.py b/tests/automated/unit/db/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/automated/unit/db/utils/validate/__init__.py b/tests/automated/unit/db/utils/validate/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/automated/unit/db/utils/validate/mock/__init__.py b/tests/automated/unit/db/utils/validate/mock/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/automated/unit/db/utils/validate/mock/class_.py b/tests/automated/unit/db/utils/validate/mock/class_.py
new file mode 100644
index 00000000..87b0d213
--- /dev/null
+++ b/tests/automated/unit/db/utils/validate/mock/class_.py
@@ -0,0 +1,10 @@
+from pydantic import BaseModel
+
+from tests.automated.unit.db.utils.validate.mock.protocol import MockProtocol
+
+
+class MockClassNoProtocol(BaseModel):
+    mock_attribute: str | None = None
+
+class MockClassWithProtocol(BaseModel, MockProtocol):
+    mock_attribute: str | None = None
\ No newline at end of file
diff --git a/tests/automated/unit/db/utils/validate/mock/protocol.py b/tests/automated/unit/db/utils/validate/mock/protocol.py
new file mode 100644
index 00000000..5a55d0fe
--- /dev/null
+++ b/tests/automated/unit/db/utils/validate/mock/protocol.py
@@ -0,0 +1,7 @@
+from asyncio import Protocol
+
+
+class MockProtocol(Protocol):
+
+    def mock_method(self) -> None:
+        pass
\ No newline at end of file
diff --git a/tests/automated/unit/db/utils/validate/test_all_models_of_same_type.py b/tests/automated/unit/db/utils/validate/test_all_models_of_same_type.py
new file mode 100644
index 00000000..8e325879
--- /dev/null
+++ b/tests/automated/unit/db/utils/validate/test_all_models_of_same_type.py
@@ -0,0 +1,17 @@
+import pytest
+
+from src.db.utils.validate import validate_all_models_of_same_type
+from tests.automated.unit.db.utils.validate.mock.class_ import MockClassNoProtocol, MockClassWithProtocol
+
+
+def test_validate_all_models_of_same_type_happy_path():
+
+    models = [MockClassNoProtocol() for _ in range(3)]
+    validate_all_models_of_same_type(models)
+
+def test_validate_all_models_of_same_type_error_path():
+
+    models = [MockClassNoProtocol() for _ in range(2)]
+    models.append(MockClassWithProtocol())
+    with pytest.raises(TypeError):
+        validate_all_models_of_same_type(models)
\ No newline at end of file
diff --git a/tests/automated/unit/db/utils/validate/test_has_protocol.py b/tests/automated/unit/db/utils/validate/test_has_protocol.py
new file mode 100644
index 00000000..cfb820a3
--- /dev/null
+++ b/tests/automated/unit/db/utils/validate/test_has_protocol.py
@@ -0,0 +1,17 @@
+import pytest
+
+from src.db.utils.validate import validate_has_protocol
+from tests.automated.unit.db.utils.validate.mock.class_ import MockClassWithProtocol, MockClassNoProtocol
+from tests.automated.unit.db.utils.validate.mock.protocol import MockProtocol
+
+
+def test_validate_has_protocol_happy_path():
+
+    model = MockClassWithProtocol()
+    validate_has_protocol(model, MockProtocol)
+
+def test_validate_has_protocol_error_path():
+
+    model = MockClassNoProtocol()
+    with pytest.raises(TypeError):
+        validate_has_protocol(model, MockProtocol)
\ No newline at end of file
diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py
index 22770205..2cc91449 100644
--- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py
+++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py
@@ -5,7 +5,7 @@
 from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO
 from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO
 from src.db.client.async_ import AsyncDatabaseClient
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo
 from src.core.logger import AsyncCoreLogger
 from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector
diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py
index c54e624e..94c3fde6 100644
--- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py
+++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py
@@ -4,7 +4,7 @@
 from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO
 from src.db.client.async_ import AsyncDatabaseClient
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo
 from src.core.logger import AsyncCoreLogger
 from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector
diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py
index 863e614b..672936e0 100644
--- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py
+++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py
@@ -6,7 +6,7 @@
 from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector
 from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector
 from src.db.client.async_ import AsyncDatabaseClient
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo
 from src.core.logger import AsyncCoreLogger
 from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO
 from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO
diff --git a/tests/conftest.py b/tests/conftest.py
index ee9a6774..4e724563 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Generator, AsyncGenerator, Coroutine
+from typing import Any, Generator, AsyncGenerator

 import pytest
 import pytest_asyncio
@@ -7,11 +7,10 @@
 from sqlalchemy import create_engine, inspect, MetaData
 from sqlalchemy.orm import scoped_session, sessionmaker

+from src.core.env_var_manager import EnvVarManager
 from src.db.client.async_ import AsyncDatabaseClient
 from src.db.client.sync import DatabaseClient
-from src.db.helpers import get_postgres_connection_string
-from src.db.models.templates import Base
-from src.core.env_var_manager import EnvVarManager
+from src.db.helpers.connect import get_postgres_connection_string
 from src.util.helper_functions import load_from_environment
 from tests.helpers.alembic_runner import AlembicRunner
 from tests.helpers.db_data_creator import DBDataCreator
diff --git a/tests/helpers/db_data_creator.py b/tests/helpers/db_data_creator.py
index 1f91bb05..a8d8331a 100644
--- a/tests/helpers/db_data_creator.py
+++ b/tests/helpers/db_data_creator.py
@@ -15,7 +15,7 @@
 from src.db.dtos.url.insert import InsertURLsInfo
 from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo
 from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo
 from src.db.dtos.url.mapping import URLMapping
 from src.db.client.sync import DatabaseClient
 from src.db.dtos.url.raw_html import RawHTMLInfo
diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py
index 7cf002f6..bc48da9f 100644
--- a/tests/manual/html_collector/test_html_tag_collector_integration.py
+++ b/tests/manual/html_collector/test_html_tag_collector_integration.py
@@ -5,7 +5,7 @@
 from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface
 from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache
 from src.db.client.async_ import AsyncDatabaseClient
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.db.models.instantiations.url.core.pydantic import URLInfo
 from tests.helpers.db_data_creator import DBDataCreator

 URLS = [
From 2f1ef9ee1cfe7b95519f879b095a31e4e1fdb93f Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Fri, 25 Jul 2025 20:34:38 -0400
Subject: [PATCH 6/6] Fix import

---
 tests/alembic/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/alembic/conftest.py b/tests/alembic/conftest.py
index 405f5677..f50dee14 100644
--- a/tests/alembic/conftest.py
+++ b/tests/alembic/conftest.py
@@ -3,7 +3,7 @@
 from sqlalchemy import create_engine, inspect, MetaData
 from sqlalchemy.orm import scoped_session, sessionmaker

-from src.db.helpers import get_postgres_connection_string
+from src.db.helpers.connect import get_postgres_connection_string
 from tests.helpers.alembic_runner import AlembicRunner
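Taken together, the three data-source sync tests (happy path, interruption, no new results) pin down a resumable, cutoff-aware pagination contract for SyncDataSourcesTaskOperator: the page counter is persisted before each fetch (hence current_page == 2 after a failure on page 2), an empty page concludes the full sync, and a stored current_cutoff_date is forwarded to sync_data_sources. The operator's own implementation is not part of this patch series, so the following is only a rough sketch of the control flow those assertions imply; the field names mirror DataSourcesSyncState and the parameters used in the tests, while the loop itself and the conclude-time resets are assumptions.

from collections.abc import Callable, Sequence
from dataclasses import dataclass
from datetime import date, datetime


@dataclass
class SyncState:
    # Mirrors the fields asserted on DataSourcesSyncState in the tests above.
    current_page: int | None = None
    current_cutoff_date: date | None = None
    last_full_sync_at: datetime | None = None


def run_sync(
    state: SyncState,
    fetch_page: Callable[[date | None, int], Sequence[dict]],
) -> None:
    """Resumable paginated sync sketch: persist the page before each fetch so an
    interrupted run can resume, and mark a full sync only once a page comes back empty."""
    page = state.current_page or 1
    while True:
        state.current_page = page  # persisted before the call, so a failure leaves the page number behind
        results = fetch_page(state.current_cutoff_date, page)
        if not results:
            # No new results: record the full sync and reset progress (assumed behavior).
            state.last_full_sync_at = datetime.now()
            state.current_page = None
            return
        # ... upsert URLs / data-source links for this page here ...
        page += 1

Under this shape, the interruption test's expectations fall out directly: page 1 is processed and persisted, the fetch of page 2 raises, and the stored state still reads current_page == 2 with last_full_sync_at unset, so the next run resumes rather than restarting.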