diff --git a/alembic/env.py b/alembic/env.py index 3d305e32..2cf7e6c8 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -6,7 +6,7 @@ from sqlalchemy import engine_from_config from sqlalchemy import pool -from src.db.helpers import get_postgres_connection_string +from src.db.helpers.connect import get_postgres_connection_string from src.db.models.templates import Base # this is the Alembic Config object, which provides diff --git a/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py b/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py new file mode 100644 index 00000000..9e990bc1 --- /dev/null +++ b/alembic/versions/2025_07_21_0637-59d2af1bab33_setup_for_sync_data_sources_task.py @@ -0,0 +1,285 @@ +"""Setup for sync data sources task + +Revision ID: 59d2af1bab33 +Revises: 9552d354ccf4 +Create Date: 2025-07-21 06:37:51.043504 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + +from src.util.alembic_helpers import switch_enum_type, id_column + +# revision identifiers, used by Alembic. 
+revision: str = '59d2af1bab33' +down_revision: Union[str, None] = '9552d354ccf4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +SYNC_STATE_TABLE_NAME = "data_sources_sync_state" +URL_DATA_SOURCES_METADATA_TABLE_NAME = "url_data_sources_metadata" + +CONFIRMED_AGENCY_TABLE_NAME = "confirmed_url_agency" +LINK_URLS_AGENCIES_TABLE_NAME = "link_urls_agencies" +CHANGE_LOG_TABLE_NAME = "change_log" + +AGENCIES_TABLE_NAME = "agencies" + +TABLES_TO_LOG = [ + LINK_URLS_AGENCIES_TABLE_NAME, + "urls", + "url_data_sources", + "agencies", +] + +OperationTypeEnum = sa.Enum("UPDATE", "DELETE", "INSERT", name="operation_type") + + +def upgrade() -> None: + _create_data_sources_sync_state_table() + _create_data_sources_sync_task() + + _rename_confirmed_url_agency_to_link_urls_agencies() + _create_change_log_table() + _add_jsonb_diff_val_function() + _create_log_table_changes_trigger() + + + _add_table_change_log_triggers() + _add_agency_id_column() + + + +def downgrade() -> None: + _drop_data_sources_sync_task() + _drop_data_sources_sync_state_table() + _drop_change_log_table() + _drop_table_change_log_triggers() + _drop_jsonb_diff_val_function() + _drop_log_table_changes_trigger() + + _rename_link_urls_agencies_to_confirmed_url_agency() + + OperationTypeEnum.drop(op.get_bind()) + _drop_agency_id_column() + + + +def _add_jsonb_diff_val_function() -> None: + op.execute( + """ + CREATE OR REPLACE FUNCTION jsonb_diff_val(val1 JSONB, val2 JSONB) + RETURNS JSONB AS + $$ + DECLARE + result JSONB; + v RECORD; + BEGIN + result = val1; + FOR v IN SELECT * FROM jsonb_each(val2) + LOOP + IF result @> jsonb_build_object(v.key, v.value) + THEN + result = result - v.key; + ELSIF result ? 
v.key THEN + CONTINUE; + ELSE + result = result || jsonb_build_object(v.key, 'null'); + END IF; + END LOOP; + RETURN result; + END; + $$ LANGUAGE plpgsql; + """ + ) + +def _drop_jsonb_diff_val_function() -> None: + op.execute("DROP FUNCTION IF EXISTS jsonb_diff_val(val1 JSONB, val2 JSONB)") + +def _create_log_table_changes_trigger() -> None: + op.execute( + f""" + CREATE OR REPLACE FUNCTION public.log_table_changes() + RETURNS trigger + LANGUAGE 'plpgsql' + COST 100 + VOLATILE NOT LEAKPROOF + AS $BODY$ + DECLARE + old_values JSONB; + new_values JSONB; + old_to_new JSONB; + new_to_old JSONB; + BEGIN + -- Handle DELETE operations (store entire OLD row since all data is lost) + IF (TG_OP = 'DELETE') THEN + old_values = row_to_json(OLD)::jsonb; + + INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, old_data) + VALUES ('DELETE', TG_TABLE_NAME, OLD.id, old_values); + + RETURN OLD; + + -- Handle UPDATE operations (only log the changed columns) + ELSIF (TG_OP = 'UPDATE') THEN + old_values = row_to_json(OLD)::jsonb; + new_values = row_to_json(NEW)::jsonb; + new_to_old = jsonb_diff_val(old_values, new_values); + old_to_new = jsonb_diff_val(new_values, old_values); + + -- Skip logging if both old_to_new and new_to_old are NULL or empty JSON objects + IF (new_to_old IS NOT NULL AND new_to_old <> '{{}}') OR + (old_to_new IS NOT NULL AND old_to_new <> '{{}}') THEN + INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, old_data, new_data) + VALUES ('UPDATE', TG_TABLE_NAME, OLD.id, new_to_old, old_to_new); + END IF; + + RETURN NEW; + + -- Handle INSERT operations + ELSIF (TG_OP = 'INSERT') THEN + new_values = row_to_json(NEW)::jsonb; + + -- Skip logging if new_values is NULL or an empty JSON object + IF new_values IS NOT NULL AND new_values <> '{{}}' THEN + INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, new_data) + VALUES ('INSERT', TG_TABLE_NAME, NEW.id, new_values); + END IF; + + RETURN NEW; + END 
 IF; + END; + $BODY$; + """ + ) + +def _drop_log_table_changes_trigger() -> None: + op.execute("DROP FUNCTION IF EXISTS public.log_table_changes()") + +def _create_data_sources_sync_state_table() -> None: + table = op.create_table( + SYNC_STATE_TABLE_NAME, + id_column(), + sa.Column('last_full_sync_at', sa.DateTime(), nullable=True), + sa.Column('current_cutoff_date', sa.Date(), nullable=True), + sa.Column('current_page', sa.Integer(), nullable=True), + ) + # Add row to `data_sources_sync_state` table + op.bulk_insert( + table, + [ + { + "last_full_sync_at": None, + "current_cutoff_date": None, + "current_page": None + } + ] + ) + +def _drop_data_sources_sync_state_table() -> None: + op.drop_table(SYNC_STATE_TABLE_NAME) + +def _create_data_sources_sync_task() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources' + ] + ) + +def _drop_data_sources_sync_task() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + ] + ) + +def _create_change_log_table() -> None: + # Create change_log table + op.create_table( + CHANGE_LOG_TABLE_NAME, + id_column(), + sa.Column("operation_type", OperationTypeEnum, nullable=False), + sa.Column("table_name", sa.String(), nullable=False), + sa.Column("affected_id", sa.Integer(), nullable=False), + sa.Column("old_data", JSONB, nullable=True), + sa.Column("new_data", JSONB, nullable=True), + sa.Column( + "created_at", sa.DateTime(), server_default=sa.func.now(), nullable=False + ), + ) + +def 
_drop_change_log_table() -> None: + op.drop_table(CHANGE_LOG_TABLE_NAME) + +def _rename_confirmed_url_agency_to_link_urls_agencies() -> None: + op.rename_table(CONFIRMED_AGENCY_TABLE_NAME, LINK_URLS_AGENCIES_TABLE_NAME) + +def _rename_link_urls_agencies_to_confirmed_url_agency() -> None: + op.rename_table(LINK_URLS_AGENCIES_TABLE_NAME, CONFIRMED_AGENCY_TABLE_NAME) + +def _add_table_change_log_triggers() -> None: + # Create trigger for tables: + def create_table_trigger(table_name: str) -> None: + op.execute( + """ + CREATE OR REPLACE TRIGGER log_{table_name}_changes + BEFORE INSERT OR DELETE OR UPDATE + ON public.{table_name} + FOR EACH ROW + EXECUTE FUNCTION public.log_table_changes(); + """.format(table_name=table_name) + ) + + for table_name in TABLES_TO_LOG: + create_table_trigger(table_name) + +def _drop_table_change_log_triggers() -> None: + def drop_table_trigger(table_name: str) -> None: + op.execute( + f""" + DROP TRIGGER log_{table_name}_changes + ON public.{table_name} + """ + ) + + for table_name in TABLES_TO_LOG: + drop_table_trigger(table_name) + +def _add_agency_id_column(): + op.add_column( + AGENCIES_TABLE_NAME, + id_column(), + ) + + +def _drop_agency_id_column(): + op.drop_column( + AGENCIES_TABLE_NAME, + 'id', + ) diff --git a/apply_migrations.py b/apply_migrations.py index 6b3188f3..2b217c8b 100644 --- a/apply_migrations.py +++ b/apply_migrations.py @@ -1,7 +1,8 @@ from alembic import command from alembic.config import Config -from src.db.helpers import get_postgres_connection_string +from src.db.helpers.connect import get_postgres_connection_string + def apply_migrations(): print("Applying migrations...") diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 15f5b631..31b858c5 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ 
b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -5,8 +5,8 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.types import UserSuggestionType diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index 3bda8ff3..50b77d0a 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,8 +5,8 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py index f1ab8b67..14a00260 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py @@ -3,7 +3,7 @@ from 
src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.core.enums import SuggestionType -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 5bfd6e8a..27f7a382 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -9,9 +9,9 @@ from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion @@ -72,11 +72,11 @@ async def run( ) ) # Must not have confirmed agencies - .join(ConfirmedURLAgency, isouter=True) + .join(LinkURLAgency, isouter=True) .where( ~exists( - select(ConfirmedURLAgency). - where(ConfirmedURLAgency.url_id == URL.id). + select(LinkURLAgency). + where(LinkURLAgency.url_id == URL.id). 
correlate(URL) ) ) diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index 1191e8d6..2db7191a 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -4,15 +4,14 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder -from src.api.endpoints.annotate.agency.get.queries.next_for_annotation import GetNextURLAgencyForAnnotationQueryBuilder from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion diff --git a/src/api/endpoints/annotate/relevance/get/query.py b/src/api/endpoints/annotate/relevance/get/query.py index ffd37d2c..11e509d0 100644 --- a/src/api/endpoints/annotate/relevance/get/query.py +++ b/src/api/endpoints/annotate/relevance/get/query.py @@ -5,10 +5,9 @@ GetNextURLForUserAnnotationQueryBuilder from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseInfo, \ RelevanceAnnotationResponseInfo -from 
src.core.tasks.url.operators.auto_relevant.models.annotation import RelevanceAnnotationInfo from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/dtos/get/logs.py b/src/api/endpoints/batch/dtos/get/logs.py index a350caa1..437e53cd 100644 --- a/src/api/endpoints/batch/dtos/get/logs.py +++ b/src/api/endpoints/batch/dtos/get/logs.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.log import LogOutputInfo +from src.db.models.instantiations.log.pydantic.output import LogOutputInfo class GetBatchLogsResponse(BaseModel): diff --git a/src/api/endpoints/batch/duplicates/dto.py b/src/api/endpoints/batch/duplicates/dto.py index 3838be77..b3fe5f17 100644 --- a/src/api/endpoints/batch/duplicates/dto.py +++ b/src/api/endpoints/batch/duplicates/dto.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from src.db.dtos.duplicate import DuplicateInfo +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo class GetDuplicatesByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index a4c3aa31..1f958a62 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -2,11 +2,11 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import aliased -from src.db.dtos.duplicate import DuplicateInfo -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.duplicate import Duplicate -from src.db.models.instantiations.link.link_batch_urls import 
LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/urls/dto.py b/src/api/endpoints/batch/urls/dto.py index 40b1e753..13e8659c 100644 --- a/src/api/endpoints/batch/urls/dto.py +++ b/src/api/endpoints/batch/urls/dto.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo class GetURLsByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index fcfba3ee..49b95e13 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,9 +1,9 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.dtos.url.core import URLInfo -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 2f29a357..03e2cc36 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -5,9 +5,9 @@ from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO from src.collectors.enums import 
CollectorType, URLStatus from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py index 12616a22..8d5f0f56 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query.py @@ -6,9 +6,9 @@ GetMetricsBatchesAggregatedInnerResponseDTO from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 771543ac..ad15c398 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -6,9 +6,9 @@ GetMetricsBatchesBreakdownInnerResponseDTO from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from 
src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/review/approve/query.py b/src/api/endpoints/review/approve/query.py index bff32bf3..c2eb8cbf 100644 --- a/src/api/endpoints/review/approve/query.py +++ b/src/api/endpoints/review/approve/query.py @@ -9,9 +9,9 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase @@ -104,7 +104,7 @@ def update_if_not_none( session.add(agency) # If the new agency id is not in the existing agency ids, add it - confirmed_url_agency = ConfirmedURLAgency( + confirmed_url_agency = LinkURLAgency( url_id=self.approval_info.url_id, agency_id=new_agency_id ) diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 8f7d5e35..0ec83dc1 100644 --- a/src/api/endpoints/review/next/query.py 
+++ b/src/api/endpoints/review/next/query.py @@ -1,6 +1,6 @@ from typing import Optional, Type -from sqlalchemy import FromClause, select, and_, Select, desc, asc, func, join +from sqlalchemy import FromClause, select, and_, Select, desc, asc, func from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload @@ -8,14 +8,14 @@ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from src.collectors.enums import URLStatus from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info -from src.db.constants import USER_ANNOTATION_MODELS, ALL_ANNOTATION_MODELS +from src.db.constants import USER_ANNOTATION_MODELS from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.mixins import URLDependentMixin @@ -44,7 +44,7 @@ def __init__(self, batch_id: Optional[int] = None): self.double_join_relationships = [ (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), - (URL.confirmed_agencies, ConfirmedURLAgency.agency) + (URL.confirmed_agencies, 
LinkURLAgency.agency) ] self.count_label = "count" diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 50bee0bc..e7afa439 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/task/by_id/dto.py b/src/api/endpoints/task/by_id/dto.py index 411ad7f7..9213aa90 100644 --- a/src/api/endpoints/task/by_id/dto.py +++ b/src/api/endpoints/task/by_id/dto.py @@ -3,8 +3,8 @@ from pydantic import BaseModel -from src.db.dtos.url.error import URLErrorPydanticInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.enums import TaskType from src.core.enums import BatchStatus diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index a57b9daf..8133085f 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -5,11 +5,11 @@ from src.api.endpoints.task.by_id.dto import TaskInfo from src.collectors.enums import URLStatus from src.core.enums import BatchStatus -from src.db.dtos.url.core import URLInfo -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.url.core import URL +from 
src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index 1ba5a75f..8bdb97bd 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -5,8 +5,8 @@ from src.api.endpoints.url.get.dto import GetURLsResponseInfo, GetURLsResponseErrorInfo, GetURLsResponseInnerInfo from src.collectors.enums import URLStatus from src.db.client.helpers import add_standard_limit_and_offset -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.url.error_info import URLErrorInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/collectors/source_collectors/base.py b/src/collectors/source_collectors/base.py index 5fbb08c5..32cd3a48 100644 --- a/src/collectors/source_collectors/base.py +++ b/src/collectors/source_collectors/base.py @@ -8,7 +8,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger from src.core.function_trigger import FunctionTrigger diff --git a/src/core/core.py b/src/core/core.py index 78554b39..ec82e3c5 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -35,7 +35,7 @@ from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from 
src.db.enums import TaskType from src.collectors.manager import AsyncCollectorManager @@ -297,7 +297,6 @@ async def approve_url( user_id=access_info.user_id ) - async def reject_url( self, url_id: int, diff --git a/src/core/logger.py b/src/core/logger.py index e49dd057..804edffd 100644 --- a/src/core/logger.py +++ b/src/core/logger.py @@ -1,7 +1,7 @@ import asyncio from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo class AsyncCoreLogger: diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index e827c77d..460cf0e0 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/base.py b/src/core/preprocessors/base.py index dea8df10..beb31cb7 100644 --- a/src/core/preprocessors/base.py +++ b/src/core/preprocessors/base.py @@ -2,7 +2,7 @@ from abc import ABC from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo class PreprocessorBase(ABC): diff --git a/src/core/preprocessors/ckan.py b/src/core/preprocessors/ckan.py index c07d4ab5..b72ee3c9 100644 --- a/src/core/preprocessors/ckan.py +++ b/src/core/preprocessors/ckan.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo class CKANPreprocessor: diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index 9a7e1d04..16f5d730 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,6 +1,6 @@ from typing import List 
-from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index dfc7338a..691d23c6 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index 281ea2f8..b42a198f 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,6 +1,6 @@ from typing import List -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index fb92dcb0..bd2e4b84 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -1,4 +1,5 @@ -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient @@ -15,9 +16,14 @@ def __init__( self.pdap_client = pdap_client - async def get_sync_agencies_task_operator(self): - operator = SyncAgenciesTaskOperator( + async def get_sync_agencies_task_operator(self) -> SyncAgenciesTaskOperator: + return SyncAgenciesTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + + async def 
get_sync_data_sources_task_operator(self) -> SyncDataSourcesTaskOperator: + return SyncDataSourcesTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ) - return operator \ No newline at end of file diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index 44576cfa..66b50535 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -6,7 +6,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -from src.core.tasks.scheduled.operators.base import ScheduledTaskOperatorBase +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase class AsyncScheduledTaskManager: @@ -30,6 +30,7 @@ def __init__( self.delete_logs_job = None self.populate_backlog_snapshot_job = None self.sync_agencies_job = None + self.sync_data_sources_job = None async def setup(self): self.scheduler.start() @@ -68,6 +69,16 @@ async def add_scheduled_tasks(self): "operator": await self.loader.get_sync_agencies_task_operator() } ) + self.sync_data_sources_job = self.scheduler.add_job( + self.run_task, + trigger=IntervalTrigger( + days=1, + start_date=datetime.now() + timedelta(minutes=3) + ), + kwargs={ + "operator": await self.loader.get_sync_data_sources_task_operator() + } + ) def shutdown(self): if self.scheduler.running: diff --git a/src/core/tasks/scheduled/operators/__init__.py b/src/core/tasks/scheduled/sync/__init__.py similarity index 100% rename from src/core/tasks/scheduled/operators/__init__.py rename to src/core/tasks/scheduled/sync/__init__.py diff --git a/src/core/tasks/scheduled/operators/agency_sync/__init__.py b/src/core/tasks/scheduled/sync/agency/__init__.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/__init__.py rename to src/core/tasks/scheduled/sync/agency/__init__.py diff --git 
a/src/core/tasks/scheduled/operators/agency_sync/dtos/__init__.py b/src/core/tasks/scheduled/sync/agency/dtos/__init__.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/dtos/__init__.py rename to src/core/tasks/scheduled/sync/agency/dtos/__init__.py diff --git a/src/core/tasks/scheduled/operators/agency_sync/dtos/parameters.py b/src/core/tasks/scheduled/sync/agency/dtos/parameters.py similarity index 69% rename from src/core/tasks/scheduled/operators/agency_sync/dtos/parameters.py rename to src/core/tasks/scheduled/sync/agency/dtos/parameters.py index 3d8cceb4..5afa53f1 100644 --- a/src/core/tasks/scheduled/operators/agency_sync/dtos/parameters.py +++ b/src/core/tasks/scheduled/sync/agency/dtos/parameters.py @@ -5,5 +5,5 @@ class AgencySyncParameters(BaseModel): - cutoff_date: Optional[date] - page: Optional[int] + cutoff_date: date | None + page: int | None diff --git a/src/core/tasks/scheduled/operators/agency_sync/core.py b/src/core/tasks/scheduled/sync/agency/operator.py similarity index 68% rename from src/core/tasks/scheduled/operators/agency_sync/core.py rename to src/core/tasks/scheduled/sync/agency/operator.py index c522effd..7b8c1a80 100644 --- a/src/core/tasks/scheduled/operators/agency_sync/core.py +++ b/src/core/tasks/scheduled/sync/agency/operator.py @@ -1,7 +1,6 @@ -from src.core.tasks.scheduled.operators.agency_sync.constants import MAX_SYNC_REQUESTS -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.operators.agency_sync.exceptions import MaxRequestsExceededError -from src.core.tasks.scheduled.operators.base import ScheduledTaskOperatorBase +from src.core.tasks.scheduled.sync.check import check_max_sync_requests_not_exceeded +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import 
AsyncDatabaseClient from src.db.enums import TaskType from src.external.pdap.client import PDAPClient @@ -29,10 +28,7 @@ async def inner_task_logic(self): response = await self.pdap_client.sync_agencies(params) request_count = 1 while len(response.agencies) > 0: - if request_count > MAX_SYNC_REQUESTS: - raise MaxRequestsExceededError( - f"Max requests in a single task run ({MAX_SYNC_REQUESTS}) exceeded." - ) + check_max_sync_requests_not_exceeded(request_count) await self.adb_client.upsert_agencies(response.agencies) params = AgencySyncParameters( diff --git a/src/db/dtos/url/annotations/__init__.py b/src/core/tasks/scheduled/sync/agency/queries/__init__.py similarity index 100% rename from src/db/dtos/url/annotations/__init__.py rename to src/core/tasks/scheduled/sync/agency/queries/__init__.py diff --git a/src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py b/src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py new file mode 100644 index 00000000..8ff148e8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/get_sync_params.py @@ -0,0 +1,30 @@ +from sqlalchemy import select +from sqlalchemy.exc import NoResultFound +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgenciesSyncParametersQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AgencySyncParameters: + query = select( + AgenciesSyncState.current_page, + AgenciesSyncState.current_cutoff_date + ) + try: + result = (await session.execute(query)).mappings().one() + return AgencySyncParameters( + page=result['current_page'], + cutoff_date=result['current_cutoff_date'] + ) + except NoResultFound: + # Add value + state = AgenciesSyncState() + session.add(state) + return AgencySyncParameters(page=None, 
cutoff_date=None) + + + diff --git a/src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py b/src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py new file mode 100644 index 00000000..50e7642c --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/mark_full_sync.py @@ -0,0 +1,13 @@ +from sqlalchemy import update, func, text, Update + +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState + + +def get_mark_full_agencies_sync_query() -> Update: + return update( + AgenciesSyncState + ).values( + last_full_sync_at=func.now(), + current_cutoff_date=func.now() - text('interval \'1 day\''), + current_page=None + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py b/src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py new file mode 100644 index 00000000..2055bdc9 --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/update_sync_progress.py @@ -0,0 +1,11 @@ +from sqlalchemy import Update, update + +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState + + +def get_update_agencies_sync_progress_query(page: int) -> Update: + return update( + AgenciesSyncState + ).values( + current_page=page + ) diff --git a/src/core/tasks/scheduled/sync/agency/queries/upsert.py b/src/core/tasks/scheduled/sync/agency/queries/upsert.py new file mode 100644 index 00000000..64988cba --- /dev/null +++ b/src/core/tasks/scheduled/sync/agency/queries/upsert.py @@ -0,0 +1,20 @@ +from src.db.models.instantiations.agency.pydantic.upsert import AgencyUpsertModel +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def convert_agencies_sync_response_to_agencies_upsert( + agencies: list[AgenciesSyncResponseInnerInfo] +) -> list[AgencyUpsertModel]: + results = [] + for agency in agencies: + results.append( + AgencyUpsertModel( + agency_id=agency.agency_id, + name=agency.display_name, + state=agency.state_name, + 
county=agency.county_name, + locality=agency.locality_name, + ds_last_updated_at=agency.updated_at + ) + ) + return results \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/check.py b/src/core/tasks/scheduled/sync/check.py new file mode 100644 index 00000000..449506c5 --- /dev/null +++ b/src/core/tasks/scheduled/sync/check.py @@ -0,0 +1,14 @@ +from src.core.tasks.scheduled.sync.constants import MAX_SYNC_REQUESTS +from src.core.tasks.scheduled.sync.exceptions import MaxRequestsExceededError + + +def check_max_sync_requests_not_exceeded(request_count: int) -> None: + """ + Raises: + MaxRequestsExceededError: If the number of requests made exceeds the maximum allowed. + """ + + if request_count > MAX_SYNC_REQUESTS: + raise MaxRequestsExceededError( + f"Max requests in a single task run ({MAX_SYNC_REQUESTS}) exceeded." + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/operators/agency_sync/constants.py b/src/core/tasks/scheduled/sync/constants.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/constants.py rename to src/core/tasks/scheduled/sync/constants.py diff --git a/src/db/dtos/url/annotations/auto/__init__.py b/src/core/tasks/scheduled/sync/data_sources/__init__.py similarity index 100% rename from src/db/dtos/url/annotations/auto/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/operator.py b/src/core/tasks/scheduled/sync/data_sources/operator.py new file mode 100644 index 00000000..a88fc34a --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/operator.py @@ -0,0 +1,43 @@ +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.core.tasks.scheduled.sync.check import check_max_sync_requests_not_exceeded +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.enums 
import TaskType +from src.external.pdap.client import PDAPClient + + +class SyncDataSourcesTaskOperator(ScheduledTaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient + ): + super().__init__(adb_client) + self.pdap_client = pdap_client + + @property + def task_type(self): + return TaskType.SYNC_DATA_SOURCES + + async def inner_task_logic(self): + params = await self.adb_client.get_data_sources_sync_parameters() + if params.page is None: + params.page = 1 + + response = await self.pdap_client.sync_data_sources(params) + request_count = 1 + while len(response.data_sources) > 0: + check_max_sync_requests_not_exceeded(request_count) + await self.adb_client.upsert_urls_from_data_sources(response.data_sources) + + params = DataSourcesSyncParameters( + page=params.page + 1, + cutoff_date=params.cutoff_date + ) + await self.adb_client.update_data_sources_sync_progress(params.page) + + response = await self.pdap_client.sync_data_sources(params) + request_count += 1 + + await self.adb_client.mark_full_data_sources_sync() diff --git a/src/core/tasks/scheduled/sync/data_sources/params.py b/src/core/tasks/scheduled/sync/data_sources/params.py new file mode 100644 index 00000000..8a502ef6 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/params.py @@ -0,0 +1,8 @@ +from datetime import date + +from pydantic import BaseModel + + +class DataSourcesSyncParameters(BaseModel): + cutoff_date: date | None + page: int | None diff --git a/src/db/queries/implementations/core/tasks/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/__init__.py similarity index 100% rename from src/db/queries/implementations/core/tasks/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py b/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py new file mode 100644 index 00000000..695813c6 --- /dev/null 
+++ b/src/core/tasks/scheduled/sync/data_sources/queries/get_sync_params.py @@ -0,0 +1,27 @@ +from sqlalchemy import select +from sqlalchemy.exc import NoResultFound +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourcesSyncParametersQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> DataSourcesSyncParameters: + query = select( + DataSourcesSyncState.current_page, + DataSourcesSyncState.current_cutoff_date + ) + try: + result = (await session.execute(query)).mappings().one() + return DataSourcesSyncParameters( + page=result['current_page'], + cutoff_date=result['current_cutoff_date'] + ) + except NoResultFound: + # Add value + state = DataSourcesSyncState() + session.add(state) + return DataSourcesSyncParameters(page=None, cutoff_date=None) diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py b/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py new file mode 100644 index 00000000..d896f765 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/mark_full_sync.py @@ -0,0 +1,13 @@ +from sqlalchemy import Update, update, func, text + +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState + + +def get_mark_full_data_sources_sync_query() -> Update: + return update( + DataSourcesSyncState + ).values( + last_full_sync_at=func.now(), + current_cutoff_date=func.now() - text('interval \'1 day\''), + current_page=None + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py b/src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py new file mode 100644 index 00000000..d6ba80e8 --- /dev/null +++ 
b/src/core/tasks/scheduled/sync/data_sources/queries/update_sync_progress.py @@ -0,0 +1,11 @@ +from sqlalchemy import update, Update + +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState + + +def get_update_data_sources_sync_progress_query(page: int) -> Update: + return update( + DataSourcesSyncState + ).values( + current_page=page + ) diff --git a/src/db/queries/implementations/core/tasks/agency_sync/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/__init__.py similarity index 100% rename from src/db/queries/implementations/core/tasks/agency_sync/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/upsert/__init__.py diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/agency_sync/__init__.py rename to src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/__init__.py diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py new file mode 100644 index 00000000..05b6ec75 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/convert.py @@ -0,0 +1,14 @@ +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic + + +def convert_to_link_url_agency_models( + url_id: int, + agency_ids: list[int] +) -> list[LinkURLAgencyPydantic]: + return [ + LinkURLAgencyPydantic( + url_id=url_id, + agency_id=agency_id + ) + for agency_id in agency_ids + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py new file mode 100644 index 00000000..e1820898 --- /dev/null +++ 
b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/core.py @@ -0,0 +1,13 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.query import URLAgencyLinkUpdateQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams + + +async def update_agency_links( + session: AsyncSession, + params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] +) -> None: + """Overwrite existing url_agency links with new ones, if applicable.""" + query = URLAgencyLinkUpdateQueryBuilder(params) + await query.run(session) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py new file mode 100644 index 00000000..d43bbbd8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/params.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class UpdateLinkURLAgencyForDataSourcesSyncParams(BaseModel): + url_id: int + new_agency_ids: list[int] + old_agency_ids: list[int] diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py new file mode 100644 index 00000000..4850be39 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/agency/query.py @@ -0,0 +1,79 @@ +from collections import defaultdict + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import 
UpdateLinkURLAgencyForDataSourcesSyncParams +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase): + """Given a set of URL-Agency links, remove all non-matching links and add new ones.""" + + + def __init__(self, models: list[UpdateLinkURLAgencyForDataSourcesSyncParams]): + super().__init__() + self.models = models + self._new_links: dict[int, list[int]] = { + model.url_id: model.new_agency_ids + for model in self.models + } + self._existing_links: dict[int, list[int]] = defaultdict(list) + self.existing_url_ids = {model.url_id for model in self.models} + + async def _get_existing_links(self, session: AsyncSession): + """Get existing agency links for provided URLs. + + Modifies: + self._existing_links + """ + query = ( + select(LinkURLAgency) + .where( + LinkURLAgency.url_id.in_( + self.existing_url_ids + ) + ) + ) + links = await session.scalars(query) + for link in links: + self._existing_links[link.url_id].append(link.agency_id) + + async def _update_links(self, session: AsyncSession): + # Remove all existing links not in new links + links_to_delete: list[LinkURLAgencyPydantic] = [] + links_to_insert: list[LinkURLAgencyPydantic] = [] + + for url_id in self.existing_url_ids: + new_agency_ids = self._new_links.get(url_id, []) + existing_agency_ids = self._existing_links.get(url_id, []) + # IDs to delete are existing agency ids that are not new agency ids + ids_to_delete = set(existing_agency_ids) - set(new_agency_ids) + # IDs to insert are new agency ids that are not existing agency ids + ids_to_insert = set(new_agency_ids) - set(existing_agency_ids) + + links_to_delete.extend( + convert_to_link_url_agency_models( + url_id=url_id, + agency_ids=list(ids_to_delete) + ) + ) + links_to_insert.extend( + convert_to_link_url_agency_models( + url_id=url_id, + agency_ids=list(ids_to_insert) + ) + ) + + await 
sh.bulk_delete(session=session, models=links_to_delete) + await sh.bulk_insert(session=session, models=links_to_insert) + + async def run(self, session: AsyncSession): + await self._get_existing_links(session=session) + await self._update_links(session=session) + + diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py new file mode 100644 index 00000000..a0517b45 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/core.py @@ -0,0 +1,94 @@ +from typing import final + +from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import override + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.helpers.filter import filter_for_urls_with_ids, \ + get_mappings_for_urls_without_data_sources +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.param_manager import \ + UpsertURLsFromDataSourcesParamManager +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.requester import UpsertURLsFromDataSourcesDBRequester +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.db.dtos.url.mapping import URLMapping +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo + + +@final +class UpsertURLsFromDataSourcesQueryBuilder(QueryBuilderBase): + + def __init__(self, sync_infos: list[DataSourcesSyncResponseInnerInfo]): + super().__init__() + self.sync_infos = sync_infos + self.urls = {sync_info.url for sync_info in self.sync_infos} + self.param_manager = UpsertURLsFromDataSourcesParamManager( + mapper=URLSyncInfoMapper(self.sync_infos) + ) + self._session: AsyncSession | None = None + self._requester: UpsertURLsFromDataSourcesDBRequester | None = None + # 
Need to be able to add URL ids first before adding links or other attributes + + @property + def requester(self) -> UpsertURLsFromDataSourcesDBRequester: + """ + Modifies: + self._requester + """ + if self._requester is None: + self._requester = UpsertURLsFromDataSourcesDBRequester(self._session) + return self._requester + + @override + async def run(self, session: AsyncSession) -> None: + """ + Modifies: + self._session + """ + self._session = session + + lookup_results = await self._lookup_urls() + lookups_existing_urls = filter_for_urls_with_ids(lookup_results) + await self._update_existing_urls(lookups_existing_urls) + await self._update_agency_link(lookups_existing_urls) + mappings_without_data_sources = get_mappings_for_urls_without_data_sources(lookup_results) + await self._add_new_data_sources(mappings_without_data_sources) + + extant_urls = {lookup.url_info.url for lookup in lookups_existing_urls} + urls_to_add = list(self.urls - extant_urls) + if len(urls_to_add) == 0: + return + url_mappings = await self._add_new_urls(urls_to_add) + await self._add_new_data_sources(url_mappings) + await self._insert_agency_link(url_mappings) + + async def _lookup_urls(self): + lookup_results = await self.requester.lookup_urls(list(self.urls)) + return lookup_results + + async def _insert_agency_link(self, url_mappings: list[URLMapping]): + link_url_agency_insert_params = self.param_manager.insert_agency_link( + url_mappings + ) + await self.requester.add_new_agency_links(link_url_agency_insert_params) + + async def _update_agency_link(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + link_url_agency_update_params = self.param_manager.update_agency_link( + lookups_existing_urls + ) + await self.requester.update_agency_links(link_url_agency_update_params) + + async def _add_new_data_sources(self, url_mappings: list[URLMapping]): + url_ds_insert_params = self.param_manager.add_new_data_sources(url_mappings) + await 
self.requester.add_new_data_sources(url_ds_insert_params) + + async def _add_new_urls(self, urls: list[str]): + url_insert_params = self.param_manager.add_new_urls(urls) + url_mappings = await self.requester.add_new_urls(url_insert_params) + return url_mappings + + async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + update_params = self.param_manager.update_existing_urls(lookups_existing_urls) + await self.requester.update_existing_urls(update_params) + diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py new file mode 100644 index 00000000..10a05d8e --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/convert.py @@ -0,0 +1,64 @@ +from src.collectors.enums import URLStatus +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.insert.params import \ + InsertURLForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.update.params import \ + UpdateURLForDataSourcesSyncParams +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus + + +def convert_to_source_collector_url_status( + ds_url_status: DataSourcesURLStatus, + ds_approval_status: ApprovalStatus +) -> URLStatus: + match ds_url_status: + case DataSourcesURLStatus.AVAILABLE: + raise NotImplementedError("Logic not implemented for this status.") + case DataSourcesURLStatus.NONE_FOUND: + raise NotImplementedError("Logic not implemented for this status.") + case DataSourcesURLStatus.BROKEN: + return URLStatus.NOT_FOUND + case _: + pass + + match 
ds_approval_status: + case ApprovalStatus.APPROVED: + return URLStatus.VALIDATED + case ApprovalStatus.REJECTED: + return URLStatus.NOT_RELEVANT + case ApprovalStatus.NEEDS_IDENTIFICATION: + return URLStatus.PENDING + case ApprovalStatus.PENDING: + return URLStatus.PENDING + case _: + raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") + +def convert_to_url_update_params( + url_id: int, + sync_info: DataSourcesSyncResponseInnerInfo +) -> UpdateURLForDataSourcesSyncParams: + return UpdateURLForDataSourcesSyncParams( + id=url_id, + name=sync_info.name, + description=sync_info.description, + outcome=convert_to_source_collector_url_status( + ds_url_status=sync_info.url_status, + ds_approval_status=sync_info.approval_status + ), + record_type=sync_info.record_type + ) + +def convert_to_url_insert_params( + url: str, + sync_info: DataSourcesSyncResponseInnerInfo +) -> InsertURLForDataSourcesSyncParams: + return InsertURLForDataSourcesSyncParams( + url=url, + name=sync_info.name, + description=sync_info.description, + outcome=convert_to_source_collector_url_status( + ds_url_status=sync_info.url_status, + ds_approval_status=sync_info.approval_status + ), + record_type=sync_info.record_type + ) diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py new file mode 100644 index 00000000..ef23fcd2 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/helpers/filter.py @@ -0,0 +1,29 @@ +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.db.dtos.url.mapping import URLMapping + + +def filter_for_urls_with_ids( + lookup_results: list[LookupURLForDataSourcesSyncResponse] +) -> list[LookupURLForDataSourcesSyncResponse]: + return [ + lookup_result + for lookup_result in lookup_results + if 
lookup_result.url_info.url_id is not None + ] + +def get_mappings_for_urls_without_data_sources( + lookup_results: list[LookupURLForDataSourcesSyncResponse] +) -> list[URLMapping]: + lookups_without_data_sources = [ + lookup_result + for lookup_result in lookup_results + if lookup_result.data_source_id is None + ] + return [ + URLMapping( + url_id=lookup_result.url_info.url_id, + url=lookup_result.url_info.url + ) + for lookup_result in lookups_without_data_sources + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py new file mode 100644 index 00000000..a60904a0 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/mapper.py @@ -0,0 +1,13 @@ +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo + + +class URLSyncInfoMapper: + + def __init__(self, sync_infos: list[DataSourcesSyncResponseInnerInfo]): + self._dict: dict[str, DataSourcesSyncResponseInnerInfo] = { + sync_info.url: sync_info + for sync_info in sync_infos + } + + def get(self, url: str) -> DataSourcesSyncResponseInnerInfo: + return self._dict[url] \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py new file mode 100644 index 00000000..19d8a0cd --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/param_manager.py @@ -0,0 +1,101 @@ +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import \ + UpdateLinkURLAgencyForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ + convert_to_url_insert_params +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper +from 
src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.insert.params import \ + InsertURLForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.update.params import \ + UpdateURLForDataSourcesSyncParams +from src.db.dtos.url.mapping import URLMapping +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.instantiations.url.data_source.pydantic import URLDataSourcePydantic + + +class UpsertURLsFromDataSourcesParamManager: + def __init__( + self, + mapper: URLSyncInfoMapper + ): + self._mapper = mapper + + def update_existing_urls( + self, + lookup_results: list[LookupURLForDataSourcesSyncResponse] + ) -> list[UpdateURLForDataSourcesSyncParams]: + results = [] + for lookup_result in lookup_results: + url_info = lookup_result.url_info + sync_info = self._mapper.get(url_info.url) + update_params = convert_to_url_update_params( + url_id=url_info.url_id, + sync_info=sync_info + ) + results.append(update_params) + return results + + def add_new_urls( + self, + urls: list[str] + ) -> list[InsertURLForDataSourcesSyncParams]: + results = [] + for url in urls: + sync_info = self._mapper.get(url) + insert_params = convert_to_url_insert_params( + url=url, + sync_info=sync_info + ) + results.append(insert_params) + return results + + def update_agency_link( + self, + lookup_results: list[LookupURLForDataSourcesSyncResponse] + ) -> list[UpdateLinkURLAgencyForDataSourcesSyncParams]: + results = [] + for lookup_result in lookup_results: + url_info = lookup_result.url_info + sync_info = self._mapper.get(url_info.url) + update_params = UpdateLinkURLAgencyForDataSourcesSyncParams( + url_id=url_info.url_id, + new_agency_ids=sync_info.agency_ids, + old_agency_ids=url_info.agency_ids + ) + results.append(update_params) + return results + + def 
insert_agency_link( + self, + url_mappings: list[URLMapping] + ) -> list[LinkURLAgencyPydantic]: + results = [] + for mapping in url_mappings: + sync_info = self._mapper.get(mapping.url) + for agency_id in sync_info.agency_ids: + results.append( + LinkURLAgencyPydantic( + url_id=mapping.url_id, + agency_id=agency_id + ) + ) + + return results + + def add_new_data_sources( + self, + mappings: list[URLMapping] + ) -> list[URLDataSourcePydantic]: + results = [] + for mapping in mappings: + sync_info = self._mapper.get(mapping.url) + results.append( + URLDataSourcePydantic( + data_source_id=sync_info.id, + url_id=mapping.url_id + ) + ) + return results + + diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py new file mode 100644 index 00000000..14a73ce8 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/requester.py @@ -0,0 +1,78 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.params import \ + UpdateLinkURLAgencyForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.agency.query import \ + URLAgencyLinkUpdateQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.insert.params import \ + InsertURLForDataSourcesSyncParams +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.query import \ + LookupURLForDataSourcesSyncQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.update.params import \ + UpdateURLForDataSourcesSyncParams +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from 
src.db.models.instantiations.url.data_source.pydantic import URLDataSourcePydantic + + +class UpsertURLsFromDataSourcesDBRequester: + + def __init__(self, session: AsyncSession): + self.session = session + + + async def add_new_urls( + self, + params: list[InsertURLForDataSourcesSyncParams] + ): + url_ids = await sh.bulk_insert( + session=self.session, + models=params, + return_ids=True + ) + results = [] + for insert_param, url_id in zip(params, url_ids): + results.append( + URLMapping( + url=insert_param.url, + url_id=url_id, + ) + ) + return results + + async def lookup_urls( + self, + urls: list[str], + ) -> list[LookupURLForDataSourcesSyncResponse]: + """Lookup URLs for data source sync-relevant information.""" + builder = LookupURLForDataSourcesSyncQueryBuilder(urls=urls) + return await builder.run(session=self.session) + + async def update_existing_urls( + self, + params: list[UpdateURLForDataSourcesSyncParams], + ) -> None: + await sh.bulk_update(session=self.session, models=params) + + async def add_new_data_sources( + self, + params: list[URLDataSourcePydantic] + ) -> None: + await sh.bulk_insert(session=self.session, models=params) + + async def add_new_agency_links( + self, + params: list[LinkURLAgencyPydantic] + ): + await sh.bulk_insert(session=self.session, models=params) + + async def update_agency_links( + self, + params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] + ) -> None: + """Overwrite existing url_agency links with new ones, if applicable.""" + query = URLAgencyLinkUpdateQueryBuilder(params) + await query.run(self.session) \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/__init__.py new 
file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py new file mode 100644 index 00000000..1cab6e0d --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py @@ -0,0 +1,16 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class InsertURLForDataSourcesSyncParams(BulkInsertableModel): + url: str + name: str + description: str | None + outcome: URLStatus + record_type: RecordType + + @classmethod + def sa_model(cls) -> type[URL]: + return URL \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py new file mode 100644 index 00000000..027cf3c3 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/format.py @@ -0,0 +1,7 @@ + + + +def format_agency_ids_result(agency_ids: list[int | None]) -> list[int]: + if agency_ids == [None]: + return [] + return agency_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py new file mode 100644 index 00000000..f24c84ae --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/query.py @@ -0,0 +1,62 @@ +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + 
+from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.format import format_agency_ids_result +from src.db.helpers.session import session_helper as sh +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.url.lookup.response import \ + LookupURLForDataSourcesSyncResponse, URLDataSyncInfo +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupURLForDataSourcesSyncQueryBuilder(QueryBuilderBase): + """Look up provided URLs for corresponding database entries.""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[LookupURLForDataSourcesSyncResponse]: + url_id_label = "url_id" + data_source_id_label = "data_source_id" + agency_ids_label = "agency_ids" + + query = ( + select( + URL.url, + URL.id.label(url_id_label), + URLDataSource.data_source_id.label(data_source_id_label), + func.json_agg(LinkURLAgency.agency_id).label(agency_ids_label) + ).select_from(URL) + .outerjoin(URLDataSource) + .outerjoin(LinkURLAgency) + .where( + URL.url.in_( + self.urls + ) + ) + .group_by( + URL.url, + URL.id, + URLDataSource.data_source_id + ) + ) + + db_results = await sh.mappings(session=session, query=query) + + final_results = [] + for db_result in db_results: + final_results.append( + LookupURLForDataSourcesSyncResponse( + data_source_id=db_result[data_source_id_label], + url_info=URLDataSyncInfo( + url=db_result["url"], + url_id=db_result[url_id_label], + agency_ids=format_agency_ids_result(db_result[agency_ids_label]) + ) + ) + ) + + return final_results diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py 
b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py new file mode 100644 index 00000000..845a6589 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/lookup/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +class URLDataSyncInfo(BaseModel): + url: str + url_id: int + agency_ids: list[int] + +class LookupURLForDataSourcesSyncResponse(BaseModel): + data_source_id: int | None + url_info: URLDataSyncInfo | None diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/__init__.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py new file mode 100644 index 00000000..fb8a9d64 --- /dev/null +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/update/params.py @@ -0,0 +1,21 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.templates.markers.bulk.update import BulkUpdatableModel + + +class UpdateURLForDataSourcesSyncParams(BulkUpdatableModel): + + @classmethod + def id_field(cls) -> str: + return "id" + + @classmethod + def sa_model(cls) -> type[URL]: + return URL + + id: int + name: str + description: str | None + outcome: URLStatus + record_type: RecordType diff --git a/src/core/tasks/scheduled/operators/agency_sync/exceptions.py b/src/core/tasks/scheduled/sync/exceptions.py similarity index 100% rename from src/core/tasks/scheduled/operators/agency_sync/exceptions.py rename to src/core/tasks/scheduled/sync/exceptions.py diff --git a/src/core/tasks/scheduled/templates/__init__.py b/src/core/tasks/scheduled/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/core/tasks/scheduled/operators/base.py b/src/core/tasks/scheduled/templates/operator.py similarity index 100% rename from src/core/tasks/scheduled/operators/base.py rename to src/core/tasks/scheduled/templates/operator.py diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index d93143aa..993807fd 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -2,7 +2,7 @@ from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py index 27459145..0c814cb2 100644 --- a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py @@ -1,13 +1,11 @@ -from typing import Any - from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus, CollectorType from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from 
src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/auto_relevant/core.py b/src/core/tasks/url/operators/auto_relevant/core.py index 1a0c6c13..d696cc31 100644 --- a/src/core/tasks/url/operators/auto_relevant/core.py +++ b/src/core/tasks/url/operators/auto_relevant/core.py @@ -3,8 +3,8 @@ from src.core.tasks.url.operators.auto_relevant.sort import separate_success_and_error_subsets from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.annotations.auto.relevancy import AutoRelevancyAnnotationInput -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.huggingface.inference.models.input import BasicInput diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index b444b5b3..78e4c983 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -7,8 +7,8 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core import URL -from 
src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/operators/record_type/core.py b/src/core/tasks/url/operators/record_type/core.py index ce73ceb4..56abc6fc 100644 --- a/src/core/tasks/url/operators/record_type/core.py +++ b/src/core/tasks/url/operators/record_type/core.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.tasks.url.operators.record_type.tdo import URLRecordTypeTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/submit_approved_url/core.py b/src/core/tasks/url/operators/submit_approved_url/core.py index dd2df39e..d2e20c3a 100644 --- a/src/core/tasks/url/operators/submit_approved_url/core.py +++ b/src/core/tasks/url/operators/submit_approved_url/core.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/url_html/core.py b/src/core/tasks/url/operators/url_html/core.py index 495845a4..39a09546 100644 --- a/src/core/tasks/url/operators/url_html/core.py +++ b/src/core/tasks/url/operators/url_html/core.py @@ 
-1,8 +1,8 @@ from http import HTTPStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.core.tasks.url.operators.url_html.tdo import UrlHtmlTDO diff --git a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py index 6af92abe..ff7f7c10 100644 --- a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py +++ b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py @@ -1,8 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.dto_converter import DTOConverter -from src.db.dtos.url.core import URLInfo -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/url_html/tdo.py b/src/core/tasks/url/operators/url_html/tdo.py index 7fe14078..326412a3 100644 --- a/src/core/tasks/url/operators/url_html/tdo.py +++ b/src/core/tasks/url/operators/url_html/tdo.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo diff --git 
a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py b/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py index 988fbe8b..446c32c4 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py +++ b/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py @@ -1,7 +1,7 @@ from typing import Optional from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index c4c9892f..e5add9ce 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -1,12 +1,10 @@ -from typing import Any - from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload from src.collectors.enums import CollectorType from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from src.db.dtos.url.html_content import HTMLContentType -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 45505be5..fe4a498e 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -3,10 +3,9 @@ from 
operator import or_ from typing import Optional, Type, Any, List, Sequence -from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, text, Row -from sqlalchemy.dialects import postgresql +from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row from sqlalchemy.dialects.postgresql import insert as pg_insert -from sqlalchemy.exc import IntegrityError, NoResultFound +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, QueryableAttribute @@ -42,19 +41,30 @@ from src.api.endpoints.review.approve.query import ApproveURLQueryBuilder from src.api.endpoints.review.enums import RejectionReason from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse +from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder from src.api.endpoints.review.reject.query import RejectURLQueryBuilder from src.api.endpoints.search.dtos.response import SearchURLResponse from src.api.endpoints.task.by_id.dto import TaskInfo - from src.api.endpoints.task.by_id.query import GetTaskInfoQueryBuilder from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse, GetTasksResponseTaskInfo from src.api.endpoints.url.get.dto import GetURLsResponseInfo - from src.api.endpoints.url.get.query import GetURLsQueryBuilder from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from src.core.env_var_manager import EnvVarManager -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder +from src.core.tasks.scheduled.sync.agency.queries.mark_full_sync 
import get_mark_full_agencies_sync_query +from src.core.tasks.scheduled.sync.agency.queries.update_sync_progress import get_update_agencies_sync_progress_query +from src.core.tasks.scheduled.sync.agency.queries.upsert import \ + convert_agencies_sync_response_to_agencies_upsert +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters +from src.core.tasks.scheduled.sync.data_sources.queries.get_sync_params import GetDataSourcesSyncParametersQueryBuilder +from src.core.tasks.scheduled.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query +from src.core.tasks.scheduled.sync.data_sources.queries.update_sync_progress import \ + get_update_data_sources_sync_progress_query +from src.core.tasks.scheduled.sync.data_sources.queries.upsert.core import \ + UpsertURLsFromDataSourcesQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \ @@ -71,39 +81,40 @@ from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ HasPendingURsMissingMiscellaneousDataQueryBuilder from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.db.helpers.session import session_helper as sh from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.dto_converter import DTOConverter -from src.db.dtos.batch import BatchInfo -from src.db.dtos.duplicate import DuplicateInsertInfo, DuplicateInfo -from src.db.dtos.log import LogInfo, LogOutputInfo -from src.db.dtos.url.annotations.auto.relevancy 
import AutoRelevancyAnnotationInput -from src.db.dtos.url.core import URLInfo -from src.db.dtos.url.error import URLErrorPydanticInfo from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.db.models.instantiations.backlog_snapshot import BacklogSnapshot -from src.db.models.instantiations.batch import Batch -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.duplicate import Duplicate -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.link.link_task_url import LinkTaskURL -from src.db.models.instantiations.log import Log +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.task_url import LinkTaskURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.instantiations.log.pydantic.output import LogOutputInfo +from src.db.models.instantiations.log.sqlalchemy import Log from src.db.models.instantiations.root_url_cache import RootURL -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState from src.db.models.instantiations.task.core import Task from 
src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.url.data_source import URLDataSource -from src.db.models.instantiations.url.error_info import URLErrorInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 @@ -111,19 +122,21 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.templates import Base from src.db.queries.base.builder import QueryBuilderBase -from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder from 
src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ GetMetricsURLSAggregatedPendingQueryBuilder -from src.db.queries.implementations.core.tasks.agency_sync.upsert import get_upsert_agencies_mappings from src.db.statement_composer import StatementComposer +from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo class AsyncDatabaseClient: @@ -168,12 +181,22 @@ async def execute(self, session: AsyncSession, statement): await session.execute(statement) @session_manager - async def add(self, session: AsyncSession, model: Base): - session.add(model) + async def add( + self, + session: AsyncSession, + model: Base, + return_id: bool = False + ) -> int | None: + return await sh.add(session=session, model=model, return_id=return_id) @session_manager - async def add_all(self, session: AsyncSession, models: list[Base]): - session.add_all(models) + async def add_all( + self, + session: AsyncSession, + models: list[Base], + return_ids: bool = False + ) -> list[int] | None: + return await sh.add_all(session=session, models=models, return_ids=return_ids) @session_manager async def bulk_update( @@ -192,42 +215,30 @@ async def bulk_update( async def bulk_upsert( self, session: AsyncSession, - model: Base, - mappings: list[dict], - id_value: str = "id" + models: list[BulkUpsertableModel], ): + return await sh.bulk_upsert(session, models) - query = 
pg_insert(model) - - set_ = {} - for k, v in mappings[0].items(): - if k == id_value: - continue - set_[k] = getattr(query.excluded, k) - - query = query.on_conflict_do_update( - index_elements=[id_value], - set_=set_ - ) - - - # Note, mapping must include primary key - await session.execute( - query, - mappings - ) + @session_manager + async def bulk_delete( + self, + session: AsyncSession, + models: list[BulkDeletableModel], + ): + return await sh.bulk_delete(session, models) @session_manager async def scalar(self, session: AsyncSession, statement): - return (await session.execute(statement)).scalar() + """Fetch the first column of the first row.""" + return await sh.scalar(session, statement) @session_manager async def scalars(self, session: AsyncSession, statement): - return (await session.execute(statement)).scalars().all() + return await sh.scalars(session, statement) @session_manager async def mapping(self, session: AsyncSession, statement): - return (await session.execute(statement)).mappings().one() + return await sh.mapping(session, statement) @session_manager async def run_query_builder( @@ -573,15 +584,9 @@ async def get_all( model: Base, order_by_attribute: Optional[str] = None ) -> list[Base]: - """ - Get all records of a model - Used primarily in testing - """ - statement = select(model) - if order_by_attribute: - statement = statement.order_by(getattr(model, order_by_attribute)) - result = await session.execute(statement) - return result.scalars().all() + """Get all records of a model. 
Used primarily in testing.""" + return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) + @session_manager async def load_root_url_cache(self, session: AsyncSession) -> dict[str, str]: @@ -755,14 +760,17 @@ async def upsert_new_agencies( Add or update agencies in the database """ for suggestion in suggestions: - agency = Agency( - agency_id=suggestion.pdap_agency_id, - name=suggestion.agency_name, - state=suggestion.state, - county=suggestion.county, - locality=suggestion.locality - ) - await session.merge(agency) + query = select(Agency).where(Agency.agency_id == suggestion.pdap_agency_id) + result = await session.execute(query) + agency = result.scalars().one_or_none() + if agency is None: + agency = Agency(agency_id=suggestion.pdap_agency_id) + agency.name = suggestion.agency_name + agency.state = suggestion.state + agency.county = suggestion.county + agency.locality = suggestion.locality + session.add(agency) + @session_manager async def add_confirmed_agency_url_links( @@ -771,7 +779,7 @@ async def add_confirmed_agency_url_links( suggestions: list[URLAgencySuggestionInfo] ): for suggestion in suggestions: - confirmed_agency = ConfirmedURLAgency( + confirmed_agency = LinkURLAgency( url_id=suggestion.url_id, agency_id=suggestion.pdap_agency_id ) @@ -824,7 +832,7 @@ async def add_agency_manual_suggestion( @session_manager async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[URL]: - statement = select(URL).where(exists().where(ConfirmedURLAgency.url_id == URL.id)) + statement = select(URL).where(exists().where(LinkURLAgency.url_id == URL.id)) results = await session.execute(statement) return list(results.scalars().all()) @@ -1296,10 +1304,6 @@ def case_column(status: URLStatus, label): oldest_pending_url_created_at=oldest_pending_created_at, ) - def compile(self, statement): - compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) - return compiled_sql - 
@session_manager async def get_urls_breakdown_pending_metrics( self, @@ -1566,56 +1570,45 @@ async def get_urls_aggregated_pending_metrics( ) return result - @session_manager - async def get_agencies_sync_parameters( - self, - session: AsyncSession - ) -> AgencySyncParameters: - query = select( - AgenciesSyncState.current_page, - AgenciesSyncState.current_cutoff_date + async def get_agencies_sync_parameters(self) -> AgencySyncParameters: + return await self.run_query_builder( + GetAgenciesSyncParametersQueryBuilder() ) - try: - result = (await session.execute(query)).mappings().one() - return AgencySyncParameters( - page=result['current_page'], - cutoff_date=result['current_cutoff_date'] - ) - except NoResultFound: - # Add value - state = AgenciesSyncState() - session.add(state) - return AgencySyncParameters(page=None, cutoff_date=None) - + async def get_data_sources_sync_parameters(self) -> DataSourcesSyncParameters: + return await self.run_query_builder( + GetDataSourcesSyncParametersQueryBuilder() + ) async def upsert_agencies( self, agencies: list[AgenciesSyncResponseInnerInfo] ): await self.bulk_upsert( - model=Agency, - mappings=get_upsert_agencies_mappings(agencies), - id_value="agency_id", + models=convert_agencies_sync_response_to_agencies_upsert(agencies) ) - async def update_agencies_sync_progress(self, page: int): - query = update( - AgenciesSyncState - ).values( - current_page=page + async def upsert_urls_from_data_sources( + self, + data_sources: list[DataSourcesSyncResponseInnerInfo] + ): + await self.run_query_builder( + UpsertURLsFromDataSourcesQueryBuilder( + sync_infos=data_sources + ) ) - await self.execute(query) + + async def update_agencies_sync_progress(self, page: int): + await self.execute(get_update_agencies_sync_progress_query(page)) + + async def update_data_sources_sync_progress(self, page: int): + await self.execute(get_update_data_sources_sync_progress_query(page)) + + async def mark_full_data_sources_sync(self): + await 
self.execute(get_mark_full_data_sources_sync_query()) async def mark_full_agencies_sync(self): - query = update( - AgenciesSyncState - ).values( - last_full_sync_at=func.now(), - current_cutoff_date=func.now() - text('interval \'1 day\''), - current_page=None - ) - await self.execute(query) + await self.execute(get_mark_full_agencies_sync_query()) @session_manager async def get_html_for_url( diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 8ec13085..361cb25a 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -7,19 +7,19 @@ from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager -from src.db.dtos.batch import BatchInfo -from src.db.dtos.duplicate import DuplicateInsertInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.log import LogInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.templates import Base -from src.db.models.instantiations.duplicate import Duplicate -from src.db.models.instantiations.log import Log -from src.db.models.instantiations.url.data_source import URLDataSource -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate +from src.db.models.instantiations.log.sqlalchemy import Log +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.instantiations.url.core.sqlalchemy import URL +from 
src.db.models.instantiations.batch.sqlalchemy import Batch from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus @@ -119,7 +119,7 @@ def insert_url(self, session, url_info: URLInfo) -> int: url_entry = URL( url=url_info.url, collector_metadata=url_info.collector_metadata, - outcome=url_info.outcome.value, + outcome=url_info.outcome, name=url_info.name ) if url_info.created_at is not None: diff --git a/src/db/client/types.py b/src/db/client/types.py index 5ee28c10..8b004e19 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -2,7 +2,7 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion UserSuggestionModel = UserRelevantSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index 80cbcd93..0b2379ef 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -2,7 +2,7 @@ from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion 
from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 5397c803..ed2d361c 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -8,16 +8,15 @@ from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo -from src.db.dtos.url.core import URLInfo from src.db.dtos.url.with_html import URLWithHTML -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.html_content import URLHTMLContent -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion @@ -129,7 +128,7 @@ def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( @staticmethod def confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies: list[ConfirmedURLAgency] + confirmed_agencies: list[LinkURLAgency] ) -> 
list[GetNextURLForAgencyAgencyInfo]: results = [] for confirmed_agency in confirmed_agencies: @@ -149,7 +148,7 @@ def confirmed_agencies_to_final_review_annotation_agency_info( @staticmethod def final_review_annotation_agency_info( automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], - confirmed_agencies: list[ConfirmedURLAgency], + confirmed_agencies: list[LinkURLAgency], user_agency_suggestion: UserUrlAgencySuggestion ): diff --git a/src/db/dtos/duplicate.py b/src/db/dtos/duplicate.py deleted file mode 100644 index d978f91e..00000000 --- a/src/db/dtos/duplicate.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import BaseModel - - -class DuplicateInsertInfo(BaseModel): - original_url_id: int - duplicate_batch_id: int - -class DuplicateInfo(DuplicateInsertInfo): - source_url: str - original_batch_id: int - duplicate_metadata: dict - original_metadata: dict \ No newline at end of file diff --git a/src/db/dtos/metadata_annotation.py b/src/db/dtos/metadata_annotation.py deleted file mode 100644 index 5a004cf1..00000000 --- a/src/db/dtos/metadata_annotation.py +++ /dev/null @@ -1,11 +0,0 @@ -from datetime import datetime - -from pydantic import BaseModel - - -class MetadataAnnotationInfo(BaseModel): - id: int - user_id: int - metadata_id: int - value: str - created_at: datetime diff --git a/src/db/dtos/url/metadata.py b/src/db/dtos/url/metadata.py deleted file mode 100644 index acac01b8..00000000 --- a/src/db/dtos/url/metadata.py +++ /dev/null @@ -1,19 +0,0 @@ -from datetime import datetime -from typing import Optional - -from pydantic import BaseModel - -from src.db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource - - -class URLMetadataInfo(BaseModel): - id: Optional[int] = None - url_id: Optional[int] = None - attribute: Optional[URLMetadataAttributeType] = None - # TODO: May need to add validation here depending on the type of attribute - value: Optional[str] = None - notes: Optional[str] = None - validation_status: 
Optional[ValidationStatus] = None - validation_source: Optional[ValidationSource] = None - created_at: Optional[datetime] = None - updated_at: Optional[datetime] = None \ No newline at end of file diff --git a/src/db/enums.py b/src/db/enums.py index 0a45addd..25701485 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -42,6 +42,12 @@ class TaskType(PyEnum): IDLE = "Idle" PROBE_404 = "404 Probe" SYNC_AGENCIES = "Sync Agencies" + SYNC_DATA_SOURCES = "Sync Data Sources" + +class ChangeLogOperationType(PyEnum): + INSERT = "INSERT" + UPDATE = "UPDATE" + DELETE = "DELETE" class PGEnum(TypeDecorator): impl = postgresql.ENUM @@ -51,3 +57,4 @@ def process_bind_param(self, value: PyEnum, dialect): if isinstance(value, PyEnum): return value.value return value + diff --git a/src/db/helpers.py b/src/db/helpers.py index 618b2e6d..10151935 100644 --- a/src/db/helpers.py +++ b/src/db/helpers.py @@ -1,5 +1,3 @@ from src.core.env_var_manager import EnvVarManager -def get_postgres_connection_string(is_async = False): - return EnvVarManager.get().get_postgres_connection_string(is_async) diff --git a/src/db/helpers/__init__.py b/src/db/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/helpers/connect.py b/src/db/helpers/connect.py new file mode 100644 index 00000000..618b2e6d --- /dev/null +++ b/src/db/helpers/connect.py @@ -0,0 +1,5 @@ +from src.core.env_var_manager import EnvVarManager + + +def get_postgres_connection_string(is_async = False): + return EnvVarManager.get().get_postgres_connection_string(is_async) diff --git a/src/db/helpers/session/__init__.py b/src/db/helpers/session/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/helpers/session/parser.py b/src/db/helpers/session/parser.py new file mode 100644 index 00000000..bc822022 --- /dev/null +++ b/src/db/helpers/session/parser.py @@ -0,0 +1,41 @@ +from src.db.helpers.session.types import BulkActionType +from src.db.models.templates import Base +from 
src.db.templates.protocols.sa_correlated.core import SQLAlchemyCorrelatedProtocol +from src.db.templates.protocols.sa_correlated.with_id import SQLAlchemyCorrelatedWithIDProtocol +from src.db.utils.validate import validate_all_models_of_same_type + + +class BulkActionParser: + + def __init__( + self, + models: list[BulkActionType], + ): + validate_all_models_of_same_type(models) + model_class = type(models[0]) + self.models = models + self.model_class = model_class + + @property + def id_field(self) -> str: + if not issubclass(self.model_class, SQLAlchemyCorrelatedWithIDProtocol): + raise TypeError("Model must implement SQLAlchemyCorrelatedWithID protocol.") + + return self.model_class.id_field() + + @property + def sa_model(self) -> type[Base]: + if not issubclass(self.model_class, SQLAlchemyCorrelatedProtocol): + raise TypeError(f"Model {self.model_class} must implement SQLAlchemyCorrelated protocol.") + return self.model_class.sa_model() + + def get_non_id_fields(self) -> list[str]: + return [ + field for field in self.model_class.model_fields.keys() + if field != self.id_field + ] + + def get_all_fields(self) -> list[str]: + return [ + field for field in self.model_class.model_fields.keys() + ] diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py new file mode 100644 index 00000000..2b3776c1 --- /dev/null +++ b/src/db/helpers/session/session_helper.py @@ -0,0 +1,214 @@ +""" +session_helper (aliased as sh) contains a number of convenience +functions for workings with a SQLAlchemy session +""" +from typing import Any, Optional, Sequence + +import sqlalchemy as sa +from sqlalchemy import update, ColumnElement, Row +from sqlalchemy.dialects import postgresql +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.helpers.session.parser import BulkActionParser +from src.db.models.templates import Base, StandardBase +from 
src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel +from src.db.templates.markers.bulk.update import BulkUpdatableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel +from src.db.templates.protocols.has_id import HasIDProtocol + + +async def one_or_none( + session: AsyncSession, + query: sa.Select +) -> sa.Row | None: + raw_result = await session.execute(query) + return raw_result.scalars().one_or_none() + +async def scalar(session: AsyncSession, query: sa.Select) -> Any: + """Fetch the first column of the first row.""" + raw_result = await session.execute(query) + return raw_result.scalar() + +async def scalars(session: AsyncSession, query: sa.Select) -> Any: + raw_result = await session.execute(query) + return raw_result.scalars().all() + +async def mapping(session: AsyncSession, query: sa.Select) -> sa.RowMapping: + raw_result = await session.execute(query) + return raw_result.mappings().one() + +async def mappings(session: AsyncSession, query: sa.Select) -> Sequence[sa.RowMapping]: + raw_result = await session.execute(query) + return raw_result.mappings().all() + +async def bulk_upsert( + session: AsyncSession, + models: list[BulkUpsertableModel], +): + if len(models) == 0: + return + parser = BulkActionParser(models) + + query = pg_insert(parser.sa_model) + + upsert_mappings = [upsert_model.model_dump() for upsert_model in models] + + set_ = {} + for k, v in upsert_mappings[0].items(): + if k == parser.id_field: + continue + set_[k] = getattr(query.excluded, k) + + query = query.on_conflict_do_update( + index_elements=[parser.id_field], + set_=set_ + ) + + # Note, mapping must include primary key + await session.execute( + statement=query, + params=upsert_mappings + ) + +async def add( + session: AsyncSession, + model: Base, + return_id: bool = False +) -> int | None: + session.add(model) + if return_id: + if not isinstance(model, HasIDProtocol): + raise 
AttributeError("Models must have an id attribute") + await session.flush() + return model.id + return None + + +async def add_all( + session: AsyncSession, + models: list[StandardBase], + return_ids: bool = False +) -> list[int] | None: + session.add_all(models) + if return_ids: + if not isinstance(models[0], HasIDProtocol): + raise AttributeError("Models must have an id attribute") + await session.flush() + return [ + model.id # pyright: ignore [reportAttributeAccessIssue] + for model in models + ] + return None + +async def get_all( + session: AsyncSession, + model: Base, + order_by_attribute: Optional[str] = None +) -> Sequence[Row]: + """ + Get all records of a model + Used primarily in testing + """ + statement = sa.select(model) + if order_by_attribute: + statement = statement.order_by(getattr(model, order_by_attribute)) + result = await session.execute(statement) + return result.scalars().all() + +def compile_to_sql(statement) -> str: + compiled_sql = statement.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}) + return compiled_sql + + +async def bulk_delete(session: AsyncSession, models: list[BulkDeletableModel]): + """Bulk delete sqlalchemy models of the same type.""" + if len(models) == 0: + return + + parser = BulkActionParser(models) + + # Use declared field names from the model (excludes properties/methods) + field_names = parser.get_all_fields() + + sa_model = parser.sa_model + + # Get value tuples to be used in identifying attributes for bulk delete + value_tuples = [] + for model in models: + tup = tuple(getattr(model, field) for field in field_names) + value_tuples.append(tup) + + + statement = ( + sa.delete( + sa_model + ).where( + sa.tuple_( + *[ + getattr(sa_model, attr) + for attr in field_names + ] + ).in_(value_tuples) + ) + ) + + await session.execute(statement) + +async def bulk_insert( + session: AsyncSession, + models: list[BulkInsertableModel], + return_ids: bool = False +) -> list[int] | None: + """Bulk 
insert sqlalchemy models via their pydantic counterparts.""" + + if len(models) == 0: + return None + + parser = BulkActionParser(models) + sa_model = parser.sa_model + + models_to_add = [] + for model in models: + sa_model_instance = sa_model(**model.model_dump()) + models_to_add.append(sa_model_instance) + + return await add_all( + session=session, + models=models_to_add, + return_ids=return_ids + ) + +async def bulk_update( + session: AsyncSession, + models: list[BulkUpdatableModel], +): + """Bulk update sqlalchemy models via their pydantic counterparts.""" + if len(models) == 0: + return + + parser = BulkActionParser(models) + + sa_model = parser.sa_model + id_field = parser.id_field + update_fields = parser.get_non_id_fields() + + + for model in models: + update_values = { + k: getattr(model, k) + for k in update_fields + } + id_value = getattr(model, id_field) + id_attr: ColumnElement = getattr(sa_model, id_field) + stmt = ( + update(sa_model) + .where( + id_attr == id_value + ) + .values(**update_values) + ) + await session.execute(stmt) + + diff --git a/src/db/helpers/session/types.py b/src/db/helpers/session/types.py new file mode 100644 index 00000000..b960b76c --- /dev/null +++ b/src/db/helpers/session/types.py @@ -0,0 +1,8 @@ +from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel +from src.db.templates.markers.bulk.update import BulkUpdatableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + +BulkActionType = ( + BulkInsertableModel | BulkUpdatableModel | BulkDeletableModel | BulkUpsertableModel +) diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index f72f06ba..6295415d 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -1,5 +1,5 @@ -from sqlalchemy import Column, TIMESTAMP, func, Integer, ForeignKey - +from sqlalchemy import Column, TIMESTAMP, func, Integer, ForeignKey, Enum as SAEnum +from enum import 
Enum as PyEnum def get_created_at_column(): return Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) @@ -7,7 +7,7 @@ def get_created_at_column(): def get_agency_id_foreign_column( nullable: bool = False -): +) -> Column: return Column( 'agency_id', Integer(), @@ -15,4 +15,19 @@ def get_agency_id_foreign_column( nullable=nullable ) +def enum_column( + enum_type: type[PyEnum], + name: str, + nullable: bool = False +) -> Column[SAEnum]: + return Column( + SAEnum( + enum_type, + name=name, + native_enum=True, + values_callable=lambda enum_type: [e.value for e in enum_type] + ), + nullable=nullable + ) + CURRENT_TIME_SERVER_DEFAULT = func.now() diff --git a/src/db/models/instantiations/agency/__init__.py b/src/db/models/instantiations/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/agency/pydantic/__init__.py b/src/db/models/instantiations/agency/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/agency/pydantic/upsert.py b/src/db/models/instantiations/agency/pydantic/upsert.py new file mode 100644 index 00000000..9a869e84 --- /dev/null +++ b/src/db/models/instantiations/agency/pydantic/upsert.py @@ -0,0 +1,23 @@ +from datetime import datetime + +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.templates import Base +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + + +class AgencyUpsertModel(BulkUpsertableModel): + + @classmethod + def id_field(cls) -> str: + return "agency_id" + + @classmethod + def sa_model(cls) -> type[Base]: + return Agency + + agency_id: int + name: str + state: str | None + county: str | None + locality: str | None + ds_last_updated_at: datetime diff --git a/src/db/models/instantiations/agency.py b/src/db/models/instantiations/agency/sqlalchemy.py similarity index 83% rename from src/db/models/instantiations/agency.py rename to 
src/db/models/instantiations/agency/sqlalchemy.py index 37beec3d..2ce3676f 100644 --- a/src/db/models/instantiations/agency.py +++ b/src/db/models/instantiations/agency/sqlalchemy.py @@ -6,16 +6,18 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import Base +from src.db.models.templates import Base, StandardBase class Agency( CreatedAtMixin, # When agency was added to database UpdatedAtMixin, # When agency was last updated in database - Base + StandardBase ): __tablename__ = "agencies" + # TODO: Rename agency_id to ds_agency_id + agency_id = Column(Integer, primary_key=True) name = Column(String, nullable=False) state = Column(String, nullable=True) @@ -30,4 +32,4 @@ class Agency( # Relationships automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") - confirmed_urls = relationship("ConfirmedURLAgency", back_populates="agency") + confirmed_urls = relationship("LinkURLAgency", back_populates="agency") diff --git a/src/db/models/instantiations/backlog_snapshot.py b/src/db/models/instantiations/backlog_snapshot.py index 240a82fd..89645160 100644 --- a/src/db/models/instantiations/backlog_snapshot.py +++ b/src/db/models/instantiations/backlog_snapshot.py @@ -1,10 +1,10 @@ from sqlalchemy import Column, Integer from src.db.models.mixins import CreatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class BacklogSnapshot(CreatedAtMixin, StandardModel): +class BacklogSnapshot(CreatedAtMixin, StandardBase): __tablename__ = "backlog_snapshot" count_pending_total = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/batch/__init__.py b/src/db/models/instantiations/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/batch.py 
b/src/db/models/instantiations/batch/pydantic.py similarity index 100% rename from src/db/dtos/batch.py rename to src/db/models/instantiations/batch/pydantic.py diff --git a/src/db/models/instantiations/batch.py b/src/db/models/instantiations/batch/sqlalchemy.py similarity index 95% rename from src/db/models/instantiations/batch.py rename to src/db/models/instantiations/batch/sqlalchemy.py index 89645f4a..c1bf14fb 100644 --- a/src/db/models/instantiations/batch.py +++ b/src/db/models/instantiations/batch/sqlalchemy.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.helpers import CURRENT_TIME_SERVER_DEFAULT -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import batch_status_enum -class Batch(StandardModel): +class Batch(StandardBase): __tablename__ = 'batches' strategy = Column( diff --git a/src/db/models/instantiations/change_log.py b/src/db/models/instantiations/change_log.py new file mode 100644 index 00000000..975958ab --- /dev/null +++ b/src/db/models/instantiations/change_log.py @@ -0,0 +1,19 @@ + +from sqlalchemy import Column, Enum +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import Mapped + +from src.db.enums import ChangeLogOperationType +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates import StandardBase + + +class ChangeLog(CreatedAtMixin, StandardBase): + + __tablename__ = "change_log" + + operation_type = Column(Enum(ChangeLogOperationType, name="operation_type")) + table_name: Mapped[str] + affected_id: Mapped[int] + old_data = Column("old_data", JSONB, nullable=True) + new_data = Column("new_data", JSONB, nullable=True) diff --git a/src/db/models/instantiations/duplicate/__init__.py b/src/db/models/instantiations/duplicate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/duplicate/pydantic/__init__.py 
b/src/db/models/instantiations/duplicate/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/duplicate/pydantic/info.py b/src/db/models/instantiations/duplicate/pydantic/info.py new file mode 100644 index 00000000..3a020e04 --- /dev/null +++ b/src/db/models/instantiations/duplicate/pydantic/info.py @@ -0,0 +1,8 @@ +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo + + +class DuplicateInfo(DuplicateInsertInfo): + source_url: str + original_batch_id: int + duplicate_metadata: dict + original_metadata: dict diff --git a/src/db/models/instantiations/duplicate/pydantic/insert.py b/src/db/models/instantiations/duplicate/pydantic/insert.py new file mode 100644 index 00000000..f753e217 --- /dev/null +++ b/src/db/models/instantiations/duplicate/pydantic/insert.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class DuplicateInsertInfo(BaseModel): + original_url_id: int + duplicate_batch_id: int + diff --git a/src/db/models/instantiations/duplicate.py b/src/db/models/instantiations/duplicate/sqlalchemy.py similarity index 84% rename from src/db/models/instantiations/duplicate.py rename to src/db/models/instantiations/duplicate/sqlalchemy.py index 7a80d918..67df3af5 100644 --- a/src/db/models/instantiations/duplicate.py +++ b/src/db/models/instantiations/duplicate/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class Duplicate(BatchDependentMixin, StandardModel): +class Duplicate(BatchDependentMixin, StandardBase): """ Identifies duplicates which occur within a batch """ diff --git a/src/db/models/instantiations/link/link_batch_urls.py b/src/db/models/instantiations/link/batch_url.py similarity index 84% rename from src/db/models/instantiations/link/link_batch_urls.py rename to 
src/db/models/instantiations/link/batch_url.py index f357ae6a..f40edc29 100644 --- a/src/db/models/instantiations/link/link_batch_urls.py +++ b/src/db/models/instantiations/link/batch_url.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin, BatchDependentMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase class LinkBatchURL( @@ -9,7 +9,7 @@ class LinkBatchURL( CreatedAtMixin, URLDependentMixin, BatchDependentMixin, - StandardModel + StandardBase ): __tablename__ = "link_batch_urls" diff --git a/src/db/models/instantiations/link/link_task_url.py b/src/db/models/instantiations/link/task_url.py similarity index 100% rename from src/db/models/instantiations/link/link_task_url.py rename to src/db/models/instantiations/link/task_url.py diff --git a/src/db/models/instantiations/link/url_agency/__init__.py b/src/db/models/instantiations/link/url_agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/link/url_agency/pydantic.py b/src/db/models/instantiations/link/url_agency/pydantic.py new file mode 100644 index 00000000..75c02119 --- /dev/null +++ b/src/db/models/instantiations/link/url_agency/pydantic.py @@ -0,0 +1,15 @@ +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkURLAgencyPydantic( + BulkDeletableModel, + BulkInsertableModel +): + url_id: int + agency_id: int + + @classmethod + def sa_model(cls) -> type[LinkURLAgency]: + return LinkURLAgency \ No newline at end of file diff --git a/src/db/models/instantiations/confirmed_url_agency.py b/src/db/models/instantiations/link/url_agency/sqlalchemy.py similarity index 61% rename from src/db/models/instantiations/confirmed_url_agency.py rename to 
src/db/models/instantiations/link/url_agency/sqlalchemy.py index db63b114..28e42924 100644 --- a/src/db/models/instantiations/confirmed_url_agency.py +++ b/src/db/models/instantiations/link/url_agency/sqlalchemy.py @@ -1,15 +1,15 @@ from sqlalchemy import UniqueConstraint -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class ConfirmedURLAgency(URLDependentMixin, StandardModel): - __tablename__ = "confirmed_url_agency" +class LinkURLAgency(URLDependentMixin, StandardBase): + __tablename__ = "link_urls_agencies" - agency_id = get_agency_id_foreign_column() + agency_id: Mapped[int] = get_agency_id_foreign_column() url = relationship("URL", back_populates="confirmed_agencies") agency = relationship("Agency", back_populates="confirmed_urls") diff --git a/src/db/models/instantiations/log/__init__.py b/src/db/models/instantiations/log/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/log/pydantic/__init__.py b/src/db/models/instantiations/log/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/log.py b/src/db/models/instantiations/log/pydantic/info.py similarity index 65% rename from src/db/dtos/log.py rename to src/db/models/instantiations/log/pydantic/info.py index 43ed1cec..aa9b06ee 100644 --- a/src/db/dtos/log.py +++ b/src/db/models/instantiations/log/pydantic/info.py @@ -9,8 +9,3 @@ class LogInfo(BaseModel): log: str batch_id: int created_at: Optional[datetime] = None - -class LogOutputInfo(BaseModel): - id: Optional[int] = None - log: str - created_at: Optional[datetime] = None \ No newline at end of file diff --git a/src/db/models/instantiations/log/pydantic/output.py b/src/db/models/instantiations/log/pydantic/output.py 
new file mode 100644 index 00000000..c58eab0f --- /dev/null +++ b/src/db/models/instantiations/log/pydantic/output.py @@ -0,0 +1,10 @@ +from datetime import datetime +from typing import Optional + +from pydantic import BaseModel + + +class LogOutputInfo(BaseModel): + id: Optional[int] = None + log: str + created_at: Optional[datetime] = None diff --git a/src/db/models/instantiations/log.py b/src/db/models/instantiations/log/sqlalchemy.py similarity index 72% rename from src/db/models/instantiations/log.py rename to src/db/models/instantiations/log/sqlalchemy.py index 756e10c5..769391cf 100644 --- a/src/db/models/instantiations/log.py +++ b/src/db/models/instantiations/log/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, BatchDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class Log(CreatedAtMixin, BatchDependentMixin, StandardModel): +class Log(CreatedAtMixin, BatchDependentMixin, StandardBase): __tablename__ = 'logs' log = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/missing.py b/src/db/models/instantiations/missing.py index 0babd91d..05665eba 100644 --- a/src/db/models/instantiations/missing.py +++ b/src/db/models/instantiations/missing.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class Missing(BatchDependentMixin, StandardModel): +class Missing(BatchDependentMixin, StandardBase): __tablename__ = 'missing' place_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/root_url_cache.py b/src/db/models/instantiations/root_url_cache.py index d121ae28..4ebadd50 100644 --- a/src/db/models/instantiations/root_url_cache.py +++ b/src/db/models/instantiations/root_url_cache.py @@ -1,10 +1,10 @@ from 
sqlalchemy import UniqueConstraint, Column, String from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class RootURL(UpdatedAtMixin, StandardModel): +class RootURL(UpdatedAtMixin, StandardBase): __tablename__ = 'root_url_cache' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/sync_state/__init__.py b/src/db/models/instantiations/sync_state/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/sync_state_agencies.py b/src/db/models/instantiations/sync_state/agencies.py similarity index 100% rename from src/db/models/instantiations/sync_state_agencies.py rename to src/db/models/instantiations/sync_state/agencies.py diff --git a/src/db/models/instantiations/sync_state/data_sources.py b/src/db/models/instantiations/sync_state/data_sources.py new file mode 100644 index 00000000..cf173860 --- /dev/null +++ b/src/db/models/instantiations/sync_state/data_sources.py @@ -0,0 +1,28 @@ +from sqlalchemy import Integer, Column, DateTime, Date + +from src.db.models.templates import Base + + +class DataSourcesSyncState(Base): + __tablename__ = 'data_sources_sync_state' + id = Column(Integer, primary_key=True) + last_full_sync_at = Column( + DateTime(), + nullable=True, + comment="The datetime of the last *full* sync " + "(i.e., the last sync that got all entries " + "available to be synchronized)." + ) + current_cutoff_date = Column( + Date(), + nullable=True, + comment="Tracks the cutoff date passed to the data sources sync endpoint." + "On completion of a full sync, this is set to " + "the day before the present day." + ) + current_page = Column( + Integer(), + nullable=True, + comment="Tracks the current page passed to the data sources sync endpoint." + "On completion of a full sync, this is set to `null`." 
+ ) \ No newline at end of file diff --git a/src/db/models/instantiations/task/core.py b/src/db/models/instantiations/task/core.py index 89c80405..514301c8 100644 --- a/src/db/models/instantiations/task/core.py +++ b/src/db/models/instantiations/task/core.py @@ -3,11 +3,11 @@ from src.db.enums import PGEnum, TaskType from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import batch_status_enum -class Task(UpdatedAtMixin, StandardModel): +class Task(UpdatedAtMixin, StandardBase): __tablename__ = 'tasks' task_type = Column( diff --git a/src/db/models/instantiations/task/error.py b/src/db/models/instantiations/task/error.py index cf1ae24f..03014904 100644 --- a/src/db/models/instantiations/task/error.py +++ b/src/db/models/instantiations/task/error.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, StandardModel): +class TaskError(UpdatedAtMixin, TaskDependentMixin, StandardBase): __tablename__ = 'task_errors' error = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/url/checked_for_duplicate.py b/src/db/models/instantiations/url/checked_for_duplicate.py index d5811c6e..9443d0ac 100644 --- a/src/db/models/instantiations/url/checked_for_duplicate.py +++ b/src/db/models/instantiations/url/checked_for_duplicate.py @@ -1,10 +1,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, StandardModel): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, StandardBase): 
__tablename__ = 'url_checked_for_duplicate' # Relationships diff --git a/src/db/models/instantiations/url/compressed_html.py b/src/db/models/instantiations/url/compressed_html.py index 5c2e06c0..206348ac 100644 --- a/src/db/models/instantiations/url/compressed_html.py +++ b/src/db/models/instantiations/url/compressed_html.py @@ -2,13 +2,13 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - StandardModel + StandardBase ): __tablename__ = 'url_compressed_html' diff --git a/src/db/models/instantiations/url/core/__init__.py b/src/db/models/instantiations/url/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/core.py b/src/db/models/instantiations/url/core/pydantic.py similarity index 100% rename from src/db/dtos/url/core.py rename to src/db/models/instantiations/url/core/pydantic.py diff --git a/src/db/models/instantiations/url/core.py b/src/db/models/instantiations/url/core/sqlalchemy.py similarity index 82% rename from src/db/models/instantiations/url/core.py rename to src/db/models/instantiations/url/core/sqlalchemy.py index 8e9860fc..c20343b6 100644 --- a/src/db/models/instantiations/url/core.py +++ b/src/db/models/instantiations/url/core/sqlalchemy.py @@ -1,13 +1,16 @@ -from sqlalchemy import Column, Integer, ForeignKey, Text, String, JSON +from sqlalchemy import Column, Integer, ForeignKey, Text, String, JSON, Enum from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.helpers import enum_column from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from 
src.db.models.types import record_type_values -class URL(UpdatedAtMixin, CreatedAtMixin, StandardModel): +class URL(UpdatedAtMixin, CreatedAtMixin, StandardBase): __tablename__ = 'urls' # The batch this URL is associated with @@ -17,21 +20,16 @@ class URL(UpdatedAtMixin, CreatedAtMixin, StandardModel): # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. - outcome = Column( - postgresql.ENUM( - 'pending', - 'submitted', - 'validated', - 'not relevant', - 'duplicate', - 'error', - '404 not found', - 'individual record', - name='url_status' - ), - nullable=False + outcome = enum_column( + URLStatus, + name='url_status', + nullable=False + ) + record_type = enum_column( + RecordType, + name='record_type', + nullable=True ) - record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=True) # Relationships batch = relationship( @@ -65,7 +63,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, StandardModel): optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( - "ConfirmedURLAgency", + "LinkURLAgency", ) data_source = relationship( "URLDataSource", diff --git a/src/db/models/instantiations/url/data_source/__init__.py b/src/db/models/instantiations/url/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/data_source/pydantic.py b/src/db/models/instantiations/url/data_source/pydantic.py new file mode 100644 index 00000000..00da8c5e --- /dev/null +++ b/src/db/models/instantiations/url/data_source/pydantic.py @@ -0,0 +1,11 @@ +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLDataSourcePydantic(BulkInsertableModel): + data_source_id: int + url_id: int + + @classmethod + def sa_model(cls) 
-> type[URLDataSource]: + return URLDataSource \ No newline at end of file diff --git a/src/db/models/instantiations/url/data_source.py b/src/db/models/instantiations/url/data_source/sqlalchemy.py similarity index 75% rename from src/db/models/instantiations/url/data_source.py rename to src/db/models/instantiations/url/data_source/sqlalchemy.py index ad6caf46..b5bdb40d 100644 --- a/src/db/models/instantiations/url/data_source.py +++ b/src/db/models/instantiations/url/data_source/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLDataSource(CreatedAtMixin, URLDependentMixin, StandardModel): +class URLDataSource(CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = "url_data_sources" data_source_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/error_info/__init__.py b/src/db/models/instantiations/url/error_info/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/error.py b/src/db/models/instantiations/url/error_info/pydantic.py similarity index 100% rename from src/db/dtos/url/error.py rename to src/db/models/instantiations/url/error_info/pydantic.py diff --git a/src/db/models/instantiations/url/error_info.py b/src/db/models/instantiations/url/error_info/sqlalchemy.py similarity index 88% rename from src/db/models/instantiations/url/error_info.py rename to src/db/models/instantiations/url/error_info/sqlalchemy.py index d2a09b6a..8825777f 100644 --- a/src/db/models/instantiations/url/error_info.py +++ b/src/db/models/instantiations/url/error_info/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import 
StandardBase -class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, StandardModel): +class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, StandardBase): __tablename__ = 'url_error_info' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/html_content.py b/src/db/models/instantiations/url/html_content.py index 39ad3666..b23af35c 100644 --- a/src/db/models/instantiations/url/html_content.py +++ b/src/db/models/instantiations/url/html_content.py @@ -3,10 +3,10 @@ from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLHTMLContent(UpdatedAtMixin, URLDependentMixin, StandardModel): +class URLHTMLContent(UpdatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = 'url_html_content' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/optional_data_source_metadata.py b/src/db/models/instantiations/url/optional_data_source_metadata.py index 84871982..fac99828 100644 --- a/src/db/models/instantiations/url/optional_data_source_metadata.py +++ b/src/db/models/instantiations/url/optional_data_source_metadata.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLOptionalDataSourceMetadata(URLDependentMixin, StandardModel): +class URLOptionalDataSourceMetadata(URLDependentMixin, StandardBase): __tablename__ = 'url_optional_data_source_metadata' record_formats = Column(ARRAY(String), nullable=True) diff --git a/src/db/models/instantiations/url/probed_for_404.py b/src/db/models/instantiations/url/probed_for_404.py index 3913e37e..b795b628 100644 --- a/src/db/models/instantiations/url/probed_for_404.py +++ 
b/src/db/models/instantiations/url/probed_for_404.py @@ -2,10 +2,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class URLProbedFor404(URLDependentMixin, StandardModel): +class URLProbedFor404(URLDependentMixin, StandardBase): __tablename__ = 'url_probed_for_404' last_probed_at = get_created_at_column() diff --git a/src/db/models/instantiations/url/reviewing_user.py b/src/db/models/instantiations/url/reviewing_user.py index d28a33e7..938f86ab 100644 --- a/src/db/models/instantiations/url/reviewing_user.py +++ b/src/db/models/instantiations/url/reviewing_user.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, StandardModel): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = 'reviewing_user_url' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/url/suggestion/agency/auto.py b/src/db/models/instantiations/url/suggestion/agency/auto.py index 5831882f..01585535 100644 --- a/src/db/models/instantiations/url/suggestion/agency/auto.py +++ b/src/db/models/instantiations/url/suggestion/agency/auto.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardModel): +class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): __tablename__ = "automated_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git 
a/src/db/models/instantiations/url/suggestion/agency/user.py b/src/db/models/instantiations/url/suggestion/agency/user.py index cb92bfc0..5a54399f 100644 --- a/src/db/models/instantiations/url/suggestion/agency/user.py +++ b/src/db/models/instantiations/url/suggestion/agency/user.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase -class UserUrlAgencySuggestion(URLDependentMixin, StandardModel): +class UserUrlAgencySuggestion(URLDependentMixin, StandardBase): __tablename__ = "user_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/record_type/auto.py b/src/db/models/instantiations/url/suggestion/record_type/auto.py index 00d738b8..34faf6f3 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/auto.py +++ b/src/db/models/instantiations/url/suggestion/record_type/auto.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import record_type_values @@ -11,7 +11,7 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardModel + StandardBase ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/record_type/user.py b/src/db/models/instantiations/url/suggestion/record_type/user.py index cda6fb17..77954509 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/user.py +++ b/src/db/models/instantiations/url/suggestion/record_type/user.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import 
relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardModel): +class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = "user_record_type_suggestions" user_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py b/src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py b/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/annotations/auto/relevancy.py b/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/input.py similarity index 100% rename from src/db/dtos/url/annotations/auto/relevancy.py rename to src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/input.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto.py b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py similarity index 88% rename from src/db/models/instantiations/url/suggestion/relevant/auto.py rename to src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py index db7f8ea2..982b4449 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/auto.py +++ b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase 
-class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardModel): +class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): __tablename__ = "auto_relevant_suggestions" relevant = Column(Boolean, nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/relevant/user.py b/src/db/models/instantiations/url/suggestion/relevant/user.py index 35d30c44..b087f71e 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/user.py +++ b/src/db/models/instantiations/url/suggestion/relevant/user.py @@ -3,14 +3,14 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardModel +from src.db.models.templates import StandardBase class UserRelevantSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardModel + StandardBase ): __tablename__ = "user_relevant_suggestions" diff --git a/src/db/models/templates.py b/src/db/models/templates.py index 3e0a1c95..5e738fab 100644 --- a/src/db/models/templates.py +++ b/src/db/models/templates.py @@ -4,7 +4,7 @@ # Base class for SQLAlchemy ORM models Base = declarative_base() -class StandardModel(Base): +class StandardBase(Base): __abstract__ = True id = Column(Integer, primary_key=True, autoincrement=True) diff --git a/src/db/queries/base/builder.py b/src/db/queries/base/builder.py index 5806ef47..4b5fd118 100644 --- a/src/db/queries/base/builder.py +++ b/src/db/queries/base/builder.py @@ -1,9 +1,9 @@ from typing import Any, Generic, Optional from sqlalchemy import FromClause, ColumnClause -from sqlalchemy.dialects import postgresql from sqlalchemy.ext.asyncio import AsyncSession +from src.db.helpers.session import session_helper as sh from src.db.types import LabelsType @@ -33,9 +33,4 @@ async def run(self, session: AsyncSession) -> Any: @staticmethod def compile(query) -> Any: - return query.compile( - 
dialect=postgresql.dialect(), - compile_kwargs={ - "literal_binds": True - } - ) + return sh.compile_to_sql(query) diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists.py index 656b56f3..41a8fc8d 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists.py @@ -18,7 +18,7 @@ from src.collectors.enums import URLStatus from src.db.constants import ALL_ANNOTATION_MODELS -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index 8ac1b4af..bd16f149 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -7,7 +7,7 @@ from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 571db2a0..d1ab774e 100644 --- 
a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -5,9 +5,9 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 503af6c3..5e27496a 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -1,11 +1,11 @@ from typing import Any, Type -from sqlalchemy import select, func, case +from sqlalchemy import select, func from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion diff --git 
a/src/db/queries/implementations/core/tasks/agency_sync/upsert.py b/src/db/queries/implementations/core/tasks/agency_sync/upsert.py deleted file mode 100644 index cff2044b..00000000 --- a/src/db/queries/implementations/core/tasks/agency_sync/upsert.py +++ /dev/null @@ -1,19 +0,0 @@ -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo - - -def get_upsert_agencies_mappings( - agencies: list[AgenciesSyncResponseInnerInfo] -) -> list[dict]: - agency_dicts = [] - for agency in agencies: - agency_dict = { - 'agency_id': agency.agency_id, - 'name': agency.display_name, - 'state': agency.state_name, - 'county': agency.county_name, - 'locality': agency.locality_name, - 'ds_last_updated_at': agency.updated_at - } - agency_dicts.append(agency_dict) - - return agency_dicts \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 9d5faa97..518aafc2 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -7,14 +7,14 @@ from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL -from src.db.models.instantiations.link.link_task_url import LinkTaskURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.link.task_url import LinkTaskURL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.url.html_content import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.url.core.sqlalchemy import 
URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.types import UserSuggestionType @@ -81,7 +81,7 @@ def exclude_urls_with_agency_suggestions( ) # Exclude if confirmed agencies exist statement = statement.where( - ~exists().where(ConfirmedURLAgency.url_id == URL.id) + ~exists().where(LinkURLAgency.url_id == URL.id) ) return statement diff --git a/src/db/templates/__init__.py b/src/db/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/markers/__init__.py b/src/db/templates/markers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/markers/bulk/__init__.py b/src/db/templates/markers/bulk/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/markers/bulk/delete.py b/src/db/templates/markers/bulk/delete.py new file mode 100644 index 00000000..9da0c980 --- /dev/null +++ b/src/db/templates/markers/bulk/delete.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class BulkDeletableModel(BaseModel): + """Identifies a model that can be used for the bulk_delete function in session_helper.""" + diff --git a/src/db/templates/markers/bulk/insert.py b/src/db/templates/markers/bulk/insert.py new file mode 100644 index 00000000..d147e44f --- /dev/null +++ b/src/db/templates/markers/bulk/insert.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class BulkInsertableModel(BaseModel): + """Identifies a model that can be used for the bulk_insert function in session_helper.""" diff --git a/src/db/templates/markers/bulk/update.py b/src/db/templates/markers/bulk/update.py new file mode 100644 index 00000000..d0476135 --- /dev/null +++ b/src/db/templates/markers/bulk/update.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class BulkUpdatableModel(BaseModel): + """Identifies a model that can be used for the bulk_update function in 
session_helper.""" diff --git a/src/db/templates/markers/bulk/upsert.py b/src/db/templates/markers/bulk/upsert.py new file mode 100644 index 00000000..86d683bb --- /dev/null +++ b/src/db/templates/markers/bulk/upsert.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class BulkUpsertableModel(BaseModel): + """Identifies a model that can be used for the bulk_upsert function in session_helper.""" \ No newline at end of file diff --git a/src/db/templates/protocols/__init__.py b/src/db/templates/protocols/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/protocols/has_id.py b/src/db/templates/protocols/has_id.py new file mode 100644 index 00000000..fc3519a2 --- /dev/null +++ b/src/db/templates/protocols/has_id.py @@ -0,0 +1,6 @@ +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class HasIDProtocol(Protocol): + id: int \ No newline at end of file diff --git a/src/db/templates/protocols/sa_correlated/__init__.py b/src/db/templates/protocols/sa_correlated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/templates/protocols/sa_correlated/core.py b/src/db/templates/protocols/sa_correlated/core.py new file mode 100644 index 00000000..6b77c835 --- /dev/null +++ b/src/db/templates/protocols/sa_correlated/core.py @@ -0,0 +1,15 @@ +from abc import abstractmethod +from typing import Protocol, runtime_checkable + +from src.db.models.templates import Base + + +@runtime_checkable +class SQLAlchemyCorrelatedProtocol(Protocol): + + + @classmethod + @abstractmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + pass diff --git a/src/db/templates/protocols/sa_correlated/with_id.py b/src/db/templates/protocols/sa_correlated/with_id.py new file mode 100644 index 00000000..4e3609e1 --- /dev/null +++ b/src/db/templates/protocols/sa_correlated/with_id.py @@ -0,0 +1,20 @@ +from abc import abstractmethod +from typing import Protocol, runtime_checkable + +from 
src.db.models.templates import Base + + +@runtime_checkable +class SQLAlchemyCorrelatedWithIDProtocol(Protocol): + + @classmethod + @abstractmethod + def id_field(cls) -> str: + """Defines the field to be used as the primary key.""" + return "id" + + @classmethod + @abstractmethod + def sa_model(cls) -> type[Base]: + """Defines the correlated SQLAlchemy model.""" + pass diff --git a/src/db/utils/validate.py b/src/db/utils/validate.py new file mode 100644 index 00000000..077b7752 --- /dev/null +++ b/src/db/utils/validate.py @@ -0,0 +1,13 @@ +from typing import Protocol + +from pydantic import BaseModel + + +def validate_has_protocol(obj: object, protocol: type[Protocol]): + if not isinstance(obj, protocol): + raise TypeError(f"Class must implement {protocol} protocol.") + +def validate_all_models_of_same_type(objects: list[object]): + first_model = objects[0] + if not all(isinstance(model, type(first_model)) for model in objects): + raise TypeError("Models must be of the same type") \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 126e7970..a68179fe 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -2,11 +2,13 @@ from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from 
src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo from src.external.pdap.enums import MatchAgencyResponseStatus @@ -175,4 +177,34 @@ async def sync_agencies( AgenciesSyncResponseInnerInfo(**entry) for entry in response_info.data["agencies"] ] + ) + + async def sync_data_sources( + self, + params: DataSourcesSyncParameters + ) -> DataSourcesSyncResponseInfo: + url = self.access_manager.build_url( + namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, + subdomains=[ + "data-sources", + "sync" + ] + ) + headers = await self.access_manager.jwt_header() + headers['Content-Type'] = "application/json" + request_info = RequestInfo( + type_=RequestType.GET, + url=url, + headers=headers, + params={ + "page": params.page, + "update_at": params.cutoff_date + } + ) + response_info = await self.access_manager.make_request(request_info) + return DataSourcesSyncResponseInfo( + data_sources=[ + DataSourcesSyncResponseInnerInfo(**entry) + for entry in response_info.data["data_sources"] + ] ) \ No newline at end of file diff --git a/src/external/pdap/dtos/sync/__init__.py b/src/external/pdap/dtos/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/dtos/agencies_sync.py b/src/external/pdap/dtos/sync/agencies.py similarity index 100% rename from src/external/pdap/dtos/agencies_sync.py rename to src/external/pdap/dtos/sync/agencies.py diff --git a/src/external/pdap/dtos/sync/data_sources.py b/src/external/pdap/dtos/sync/data_sources.py new file mode 100644 index 00000000..a5fe92b9 --- /dev/null +++ b/src/external/pdap/dtos/sync/data_sources.py @@ -0,0 +1,21 @@ +from datetime import datetime + +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.external.pdap.enums import ApprovalStatus, 
DataSourcesURLStatus + + +class DataSourcesSyncResponseInnerInfo(BaseModel): + id: int + url: str + name: str + description: str | None + record_type: RecordType + agency_ids: list[int] + approval_status: ApprovalStatus + url_status: DataSourcesURLStatus + updated_at: datetime + +class DataSourcesSyncResponseInfo(BaseModel): + data_sources: list[DataSourcesSyncResponseInnerInfo] \ No newline at end of file diff --git a/src/external/pdap/enums.py b/src/external/pdap/enums.py index 36111acd..c532f820 100644 --- a/src/external/pdap/enums.py +++ b/src/external/pdap/enums.py @@ -12,3 +12,9 @@ class ApprovalStatus(Enum): REJECTED = "rejected" PENDING = "pending" NEEDS_IDENTIFICATION = "needs identification" + +class DataSourcesURLStatus(Enum): + AVAILABLE = "available" + BROKEN = "broken" + OK = "ok" + NONE_FOUND = "none found" \ No newline at end of file diff --git a/tests/alembic/conftest.py b/tests/alembic/conftest.py index 405f5677..f50dee14 100644 --- a/tests/alembic/conftest.py +++ b/tests/alembic/conftest.py @@ -3,7 +3,7 @@ from sqlalchemy import create_engine, inspect, MetaData from sqlalchemy.orm import scoped_session, sessionmaker -from src.db.helpers import get_postgres_connection_string +from src.db.helpers.connect import get_postgres_connection_string from tests.helpers.alembic_runner import AlembicRunner diff --git a/tests/automated/integration/api/review/rejection/helpers.py b/tests/automated/integration/api/review/rejection/helpers.py index 8fb26603..2162a7b8 100644 --- a/tests/automated/integration/api/review/rejection/helpers.py +++ b/tests/automated/integration/api/review/rejection/helpers.py @@ -2,7 +2,7 @@ from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.review.reject.dto import FinalReviewRejectionInfo from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from 
tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @@ -36,4 +36,4 @@ async def run_rejection_test( assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.outcome == url_status.value + assert url.outcome == url_status diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index 9afc16d8..780484cc 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -5,9 +5,9 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @@ -54,8 +54,8 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.record_type == RecordType.ARREST_RECORDS.value - assert url.outcome == URLStatus.VALIDATED.value + assert url.record_type == RecordType.ARREST_RECORDS + assert url.outcome == URLStatus.VALIDATED assert url.name == "New Test Name" assert url.description == "New Test Description" @@ -66,7 +66,7 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): assert optional_metadata[0].record_formats == ["New Test 
Record Format", "New Test Record Format 2"] # Get agencies - confirmed_agencies = await adb_client.get_all(ConfirmedURLAgency) + confirmed_agencies = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agencies) == 4 for agency in confirmed_agencies: assert agency.agency_id in agency_ids diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py index eea90bf2..07408ff0 100644 --- a/tests/automated/integration/api/test_batch.py +++ b/tests/automated/integration/api/test_batch.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.dtos.url.insert import InsertURLsInfo from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType, URLStatus diff --git a/tests/automated/integration/api/test_example_collector.py b/tests/automated/integration/api/test_example_collector.py index 1e20362d..2903c528 100644 --- a/tests/automated/integration/api/test_example_collector.py +++ b/tests/automated/integration/api/test_example_collector.py @@ -7,7 +7,7 @@ from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO from src.collectors.source_collectors.example.core import ExampleCollector from src.collectors.enums import CollectorType diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index a7be37e4..bdf858f7 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,10 +2,10 @@ import 
pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.instantiations.link.link_batch_urls import LinkBatchURL +from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.batch import Batch +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.batch.sqlalchemy import Batch from src.collectors.enums import CollectorType from src.core.enums import RecordType diff --git a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py index 33a93998..37ed6462 100644 --- a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py +++ b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py @@ -1,7 +1,7 @@ import pytest from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 590f9cd1..df783e84 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,8 +3,8 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.confirmed_url_agency import ConfirmedURLAgency -from 
src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @@ -41,12 +41,12 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.record_type == RecordType.ARREST_RECORDS.value - assert url.outcome == URLStatus.VALIDATED.value + assert url.record_type == RecordType.ARREST_RECORDS + assert url.outcome == URLStatus.VALIDATED assert url.name == "Test Name" assert url.description == "Test Description" - confirmed_agency: list[ConfirmedURLAgency] = await adb_client.get_all(ConfirmedURLAgency) + confirmed_agency: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agency) == 1 assert confirmed_agency[0].url_id == url_mapping.url_id assert confirmed_agency[0].agency_id == agency_id diff --git a/tests/automated/integration/db/client/test_add_url_error_info.py b/tests/automated/integration/db/client/test_add_url_error_info.py index 34d103ce..3bb25e58 100644 --- a/tests/automated/integration/db/client/test_add_url_error_info.py +++ b/tests/automated/integration/db/client/test_add_url_error_info.py @@ -1,7 +1,7 @@ import pytest from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_delete_old_logs.py b/tests/automated/integration/db/client/test_delete_old_logs.py index d451af8f..1a5b0cd7 100644 
--- a/tests/automated/integration/db/client/test_delete_old_logs.py +++ b/tests/automated/integration/db/client/test_delete_old_logs.py @@ -2,7 +2,7 @@ import pytest -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_delete_url_updated_at.py b/tests/automated/integration/db/client/test_delete_url_updated_at.py index a6ca731b..34bbc7b3 100644 --- a/tests/automated/integration/db/client/test_delete_url_updated_at.py +++ b/tests/automated/integration/db/client/test_delete_url_updated_at.py @@ -1,4 +1,4 @@ -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_logs.py b/tests/automated/integration/db/client/test_insert_logs.py index d752c894..6da198d8 100644 --- a/tests/automated/integration/db/client/test_insert_logs.py +++ b/tests/automated/integration/db/client/test_insert_logs.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.log import LogInfo +from src.db.models.instantiations.log.pydantic.info import LogInfo from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 73a88d02..a9aaf1fe 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -1,8 +1,8 @@ import pytest from src.core.enums import BatchStatus -from src.db.dtos.batch import BatchInfo -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo @pytest.mark.asyncio diff --git a/tests/automated/integration/db/structure/README.md 
b/tests/automated/integration/db/structure/README.md new file mode 100644 index 00000000..2e22a324 --- /dev/null +++ b/tests/automated/integration/db/structure/README.md @@ -0,0 +1,6 @@ +Database Structure tests, in this instance +Test the integrity of the database schema and that it behaves as expected. + +This includes testing that: +* Enum columns allow only allowed values (and throw errors on others) +* Column types are correct diff --git a/tests/automated/integration/db/structure/__init__.py b/tests/automated/integration/db/structure/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/test_batch.py b/tests/automated/integration/db/structure/test_batch.py new file mode 100644 index 00000000..f905b178 --- /dev/null +++ b/tests/automated/integration/db/structure/test_batch.py @@ -0,0 +1,88 @@ +import sqlalchemy as sa +from sqlalchemy import create_engine +from sqlalchemy.dialects import postgresql + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.helpers.connect import get_postgres_connection_string +from src.util.helper_functions import get_enum_values +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester + + +def test_batch(wiped_database): + engine = create_engine(get_postgres_connection_string()) + table_tester = TableTester( + table_name="batches", + columns=[ + ColumnTester( + column_name="strategy", + type_=postgresql.ENUM, + allowed_values=get_enum_values(CollectorType), + ), + ColumnTester( + column_name="user_id", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="status", + type_=postgresql.ENUM, + allowed_values=get_enum_values(BatchStatus), + ), + ColumnTester( + column_name="total_url_count", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="original_url_count", + 
type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="duplicate_url_count", + type_=sa.Integer, + allowed_values=[1], + ), + ColumnTester( + column_name="strategy_success_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="metadata_success_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="agency_match_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="record_type_match_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="record_category_match_rate", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="compute_time", + type_=sa.Float, + allowed_values=[1.0], + ), + ColumnTester( + column_name="parameters", + type_=sa.JSON, + allowed_values=[{}] + ) + + ], + engine=engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/test_html_content.py b/tests/automated/integration/db/structure/test_html_content.py new file mode 100644 index 00000000..8c9c3207 --- /dev/null +++ b/tests/automated/integration/db/structure/test_html_content.py @@ -0,0 +1,38 @@ +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.enums import URLHTMLContentType +from src.util.helper_functions import get_enum_values +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester +from tests.helpers.db_data_creator import DBDataCreator + + +def test_html_content(db_data_creator: DBDataCreator): + batch_id = db_data_creator.batch() + iui: InsertURLsInfo = db_data_creator.urls(batch_id=batch_id, url_count=1) + + table_tester = TableTester( + table_name="url_html_content", + columns=[ + ColumnTester( + column_name="url_id", + type_=sa.Integer, + allowed_values=[iui.url_mappings[0].url_id] + ), + 
ColumnTester( + column_name="content_type", + type_=postgresql.ENUM, + allowed_values=get_enum_values(URLHTMLContentType) + ), + ColumnTester( + column_name="content", + type_=sa.Text, + allowed_values=["Text"] + ) + ], + engine=db_data_creator.db_client.engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/test_root_url.py b/tests/automated/integration/db/structure/test_root_url.py new file mode 100644 index 00000000..7c3712df --- /dev/null +++ b/tests/automated/integration/db/structure/test_root_url.py @@ -0,0 +1,32 @@ +import sqlalchemy as sa + +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester +from tests.helpers.db_data_creator import DBDataCreator + + +def test_root_url(db_data_creator: DBDataCreator): + + table_tester = TableTester( + table_name="root_urls", + columns=[ + ColumnTester( + column_name="url", + type_=sa.String, + allowed_values=["https://example.com"] + ), + ColumnTester( + column_name="page_title", + type_=sa.String, + allowed_values=["Text"] + ), + ColumnTester( + column_name="page_description", + type_=sa.String, + allowed_values=["Text"] + ) + ], + engine=db_data_creator.db_client.engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/test_upsert_new_agencies.py b/tests/automated/integration/db/structure/test_upsert_new_agencies.py new file mode 100644 index 00000000..17a184f4 --- /dev/null +++ b/tests/automated/integration/db/structure/test_upsert_new_agencies.py @@ -0,0 +1,59 @@ +import pytest + +from src.core.enums import SuggestionType +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.db.models.instantiations.agency.sqlalchemy import Agency +from tests.helpers.db_data_creator import DBDataCreator + + +@pytest.mark.asyncio +async def test_upsert_new_agencies( + 
wiped_database, + db_data_creator: DBDataCreator +): + """ + Check that if the agency doesn't exist, it is added + But if the agency does exist, it is updated with new information + """ + + suggestions = [] + for i in range(3): + suggestion = URLAgencySuggestionInfo( + url_id=1, + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=i, + agency_name=f"Test Agency {i}", + state=f"Test State {i}", + county=f"Test County {i}", + locality=f"Test Locality {i}", + user_id=1 + ) + suggestions.append(suggestion) + + adb_client = db_data_creator.adb_client + await adb_client.upsert_new_agencies(suggestions) + + update_suggestion = URLAgencySuggestionInfo( + url_id=1, + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=0, + agency_name="Updated Test Agency", + state="Updated Test State", + county="Updated Test County", + locality="Updated Test Locality", + user_id=1 + ) + + await adb_client.upsert_new_agencies([update_suggestion]) + + rows = await adb_client.get_all(Agency, order_by_attribute="agency_id") + + assert len(rows) == 3 + + d = {} + for row in rows: + d[row.agency_id] = row.name + + assert d[0] == "Updated Test Agency" + assert d[1] == "Test Agency 1" + assert d[2] == "Test Agency 2" diff --git a/tests/automated/integration/db/structure/test_url.py b/tests/automated/integration/db/structure/test_url.py new file mode 100644 index 00000000..c9c3cf79 --- /dev/null +++ b/tests/automated/integration/db/structure/test_url.py @@ -0,0 +1,45 @@ +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from src.collectors.enums import URLStatus +from src.util.helper_functions import get_enum_values +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.testers.table import TableTester +from tests.helpers.db_data_creator import DBDataCreator + + +def test_url(db_data_creator: DBDataCreator): + batch_id = db_data_creator.batch() + table_tester = 
TableTester( + table_name="urls", + columns=[ + ColumnTester( + column_name="batch_id", + type_=sa.Integer, + allowed_values=[batch_id], + ), + ColumnTester( + column_name="url", + type_=sa.String, + allowed_values=["https://example.com"], + ), + ColumnTester( + column_name="collector_metadata", + type_=sa.JSON, + allowed_values=[{}] + ), + ColumnTester( + column_name="outcome", + type_=postgresql.ENUM, + allowed_values=get_enum_values(URLStatus) + ), + ColumnTester( + column_name="name", + type_=sa.String, + allowed_values=['test'], + ) + ], + engine=db_data_creator.db_client.engine + ) + + table_tester.run_column_tests() diff --git a/tests/automated/integration/db/structure/testers/__init__.py b/tests/automated/integration/db/structure/testers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/testers/models/__init__.py b/tests/automated/integration/db/structure/testers/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/testers/models/column.py b/tests/automated/integration/db/structure/testers/models/column.py new file mode 100644 index 00000000..1b4c5a50 --- /dev/null +++ b/tests/automated/integration/db/structure/testers/models/column.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + +from tests.automated.integration.db.structure.types import SATypes + + +@dataclass +class ColumnTester: + column_name: str + type_: SATypes + allowed_values: list diff --git a/tests/automated/integration/db/structure/testers/models/foreign_key.py b/tests/automated/integration/db/structure/testers/models/foreign_key.py new file mode 100644 index 00000000..517a82a8 --- /dev/null +++ b/tests/automated/integration/db/structure/testers/models/foreign_key.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + + +@dataclass +class ForeignKeyTester: + column_name: str + valid_id: int + invalid_id: int diff --git 
a/tests/automated/integration/db/structure/testers/models/unique_constraint.py b/tests/automated/integration/db/structure/testers/models/unique_constraint.py new file mode 100644 index 00000000..baa85cbb --- /dev/null +++ b/tests/automated/integration/db/structure/testers/models/unique_constraint.py @@ -0,0 +1,6 @@ +from dataclasses import dataclass + + +@dataclass +class UniqueConstraintTester: + columns: list[str] diff --git a/tests/automated/integration/db/structure/testers/table.py b/tests/automated/integration/db/structure/testers/table.py new file mode 100644 index 00000000..aed5d3a5 --- /dev/null +++ b/tests/automated/integration/db/structure/testers/table.py @@ -0,0 +1,95 @@ +from typing import Optional, Any + +import pytest +import sqlalchemy as sa +from sqlalchemy import create_engine +from sqlalchemy.dialects import postgresql +from sqlalchemy.exc import DataError + +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.templates import Base +from tests.automated.integration.db.structure.testers.models.column import ColumnTester +from tests.automated.integration.db.structure.types import ConstraintTester, SATypes + + +class TableTester: + + def __init__( + self, + columns: list[ColumnTester], + table_name: str, + engine: Optional[sa.Engine] = None, + constraints: Optional[list[ConstraintTester]] = None, + ): + if engine is None: + engine = create_engine(get_postgres_connection_string()) + self.columns = columns + self.table_name = table_name + self.constraints = constraints + self.engine = engine + + def run_tests(self): + pass + + def setup_row_dict(self, override: Optional[dict[str, Any]] = None): + d = {} + for column in self.columns: + # For row dicts, the first value is the default + d[column.column_name] = column.allowed_values[0] + if override is not None: + d.update(override) + return d + + def run_column_test(self, column: ColumnTester): + if len(column.allowed_values) == 1: + return # It will be 
tested elsewhere + for value in column.allowed_values: + print(f"Testing column {column.column_name} with value {value}") + row_dict = self.setup_row_dict(override={column.column_name: value}) + table = self.get_table_model() + with self.engine.begin() as conn: + # Delete existing rows + conn.execute(table.delete()) + conn.commit() + with self.engine.begin() as conn: + conn.execute(table.insert(), row_dict) + conn.commit() + conn.close() + self.test_invalid_values(column) + + def generate_invalid_value(self, type_: SATypes): + match type_: + case sa.Integer: + return "not an integer" + case sa.String: + return -1 + case postgresql.ENUM: + return "not an enum value" + case sa.TIMESTAMP: + return "not a timestamp" + + def test_invalid_values(self, column: ColumnTester): + invalid_value = self.generate_invalid_value(type_=column.type_) + row_dict = self.setup_row_dict(override={column.column_name: invalid_value}) + table = self.get_table_model() + print(f"Testing column '{column.column_name}' with invalid value {invalid_value}") + with pytest.raises(DataError): + with self.engine.begin() as conn: + conn.execute(table.delete()) + conn.commit() + with self.engine.begin() as conn: + conn.execute(table.insert(), row_dict) + conn.commit() + conn.close() + + + def get_table_model(self) -> sa.Table: + """ + Retrieve table model from metadata + """ + return sa.Table(self.table_name, Base.metadata, autoload_with=self.engine) + + + def run_column_tests(self): + for column in self.columns: + self.run_column_test(column) diff --git a/tests/automated/integration/db/structure/types.py b/tests/automated/integration/db/structure/types.py new file mode 100644 index 00000000..3124538f --- /dev/null +++ b/tests/automated/integration/db/structure/types.py @@ -0,0 +1,10 @@ +from typing import TypeAlias + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from tests.automated.integration.db.structure.testers.models.foreign_key import ForeignKeyTester +from 
tests.automated.integration.db.structure.testers.models.unique_constraint import UniqueConstraintTester + +SATypes: TypeAlias = sa.Integer | sa.String | postgresql.ENUM | sa.TIMESTAMP | sa.Text +ConstraintTester: TypeAlias = UniqueConstraintTester | ForeignKeyTester diff --git a/tests/automated/integration/db/test_change_log.py b/tests/automated/integration/db/test_change_log.py new file mode 100644 index 00000000..dde2d702 --- /dev/null +++ b/tests/automated/integration/db/test_change_log.py @@ -0,0 +1,96 @@ +import pytest +from sqlalchemy import update, delete + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.enums import ChangeLogOperationType +from src.db.models.instantiations.change_log import ChangeLog +from src.db.models.instantiations.url.core.sqlalchemy import URL + + +class _TestChangeGetter: + + def __init__(self, adb: AsyncDatabaseClient): + self.adb = adb + + async def get_change_log_entries(self): + return await self.adb.get_all(ChangeLog) + +@pytest.mark.asyncio +async def test_change_log(wiped_database, adb_client_test: AsyncDatabaseClient): + getter = _TestChangeGetter(adb_client_test) + + # Confirm no entries in the change log table + entries = await getter.get_change_log_entries() + assert len(entries) == 0 + + # Add entry to URL table + url = URL( + url="test_url", + name="test_name", + description="test_description", + outcome='pending' + ) + url_id = await adb_client_test.add(url, return_id=True) + + # Choose a single logged table -- URL -- for testing + entries = await getter.get_change_log_entries() + assert len(entries) == 1 + entry: ChangeLog = entries[0] + assert entry.operation_type == ChangeLogOperationType.INSERT + assert entry.table_name == "urls" + assert entry.affected_id == url_id + assert entry.old_data is None + assert entry.new_data is not None + nd = entry.new_data + assert nd["id"] == url_id + assert nd["url"] == "test_url" + assert nd["name"] == "test_name" + assert nd["description"] == 
"test_description" + assert nd["outcome"] == "pending" + assert nd["created_at"] is not None + assert nd["updated_at"] is not None + assert nd['record_type'] is None + assert nd['collector_metadata'] is None + + # Update URL + + await adb_client_test.execute( + update(URL).where(URL.id == url_id).values( + name="new_name", + description="new_description" + ) + ) + + # Confirm change log entry + entries = await getter.get_change_log_entries() + assert len(entries) == 2 + entry: ChangeLog = entries[1] + assert entry.operation_type == ChangeLogOperationType.UPDATE + assert entry.table_name == "urls" + assert entry.affected_id == url_id + assert entry.old_data is not None + assert entry.new_data is not None + od = entry.old_data + nd = entry.new_data + assert nd['description'] == "new_description" + assert od['description'] == "test_description" + assert nd['name'] == "new_name" + assert od['name'] == "test_name" + assert nd['updated_at'] is not None + assert od['updated_at'] is not None + + # Delete URL + await adb_client_test.execute( + delete(URL).where(URL.id == url_id) + ) + + # Confirm change log entry + entries = await getter.get_change_log_entries() + assert len(entries) == 3 + entry: ChangeLog = entries[2] + assert entry.operation_type == ChangeLogOperationType.DELETE + assert entry.table_name == "urls" + assert entry.affected_id == url_id + assert entry.old_data is not None + assert entry.new_data is None + diff --git a/tests/automated/integration/db/test_database_structure.py b/tests/automated/integration/db/test_database_structure.py deleted file mode 100644 index 7b34cebb..00000000 --- a/tests/automated/integration/db/test_database_structure.py +++ /dev/null @@ -1,348 +0,0 @@ -""" -Database Structure tests, in this instance -Test the integrity of the database schema and that it behaves as expected. 
- -This includes testing that: -* Enum columns allow only allowed values (and throw errors on others) -* Column types are correct -""" - -from dataclasses import dataclass -from typing import TypeAlias, Optional, Any - -import pytest -import sqlalchemy as sa -from sqlalchemy import create_engine -from sqlalchemy.dialects import postgresql -from sqlalchemy.exc import DataError - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.db.enums import URLHTMLContentType -from src.db.helpers import get_postgres_connection_string -from src.db.models.instantiations.agency import Agency -from src.collectors.enums import CollectorType, URLStatus -from src.core.enums import BatchStatus, SuggestionType -from src.db.models.templates import Base -from src.util.helper_functions import get_enum_values -from tests.helpers.db_data_creator import DBDataCreator - -SATypes: TypeAlias = sa.Integer or sa.String or postgresql.ENUM or sa.TIMESTAMP or sa.Text - -@dataclass -class ColumnTester: - column_name: str - type_: SATypes - allowed_values: list - -@dataclass -class UniqueConstraintTester: - columns: list[str] - -@dataclass -class ForeignKeyTester: - column_name: str - valid_id: int - invalid_id: int - -ConstraintTester: TypeAlias = UniqueConstraintTester or ForeignKeyTester - -class TableTester: - - def __init__( - self, - columns: list[ColumnTester], - table_name: str, - engine: Optional[sa.Engine] = None, - constraints: Optional[list[ConstraintTester]] = None, - ): - if engine is None: - engine = create_engine(get_postgres_connection_string(is_async=True)) - self.columns = columns - self.table_name = table_name - self.constraints = constraints - self.engine = engine - - def run_tests(self): - pass - - def setup_row_dict(self, override: Optional[dict[str, Any]] = None): - d = {} - for column in self.columns: - # For row dicts, the first value is the default - 
d[column.column_name] = column.allowed_values[0] - if override is not None: - d.update(override) - return d - - def run_column_test(self, column: ColumnTester): - if len(column.allowed_values) == 1: - return # It will be tested elsewhere - for value in column.allowed_values: - print(f"Testing column {column.column_name} with value {value}") - row_dict = self.setup_row_dict(override={column.column_name: value}) - table = self.get_table_model() - with self.engine.begin() as conn: - # Delete existing rows - conn.execute(table.delete()) - conn.commit() - with self.engine.begin() as conn: - conn.execute(table.insert(), row_dict) - conn.commit() - conn.close() - self.test_invalid_values(column) - - def generate_invalid_value(self, type_: SATypes): - match type_: - case sa.Integer: - return "not an integer" - case sa.String: - return -1 - case postgresql.ENUM: - return "not an enum value" - case sa.TIMESTAMP: - return "not a timestamp" - - def test_invalid_values(self, column: ColumnTester): - invalid_value = self.generate_invalid_value(type_=column.type_) - row_dict = self.setup_row_dict(override={column.column_name: invalid_value}) - table = self.get_table_model() - print(f"Testing column '{column.column_name}' with invalid value {invalid_value}") - with pytest.raises(DataError): - with self.engine.begin() as conn: - conn.execute(table.delete()) - conn.commit() - with self.engine.begin() as conn: - conn.execute(table.insert(), row_dict) - conn.commit() - conn.close() - - - def get_table_model(self) -> sa.Table: - """ - Retrieve table model from metadata - """ - return sa.Table(self.table_name, Base.metadata, autoload_with=self.engine) - - - def run_column_tests(self): - for column in self.columns: - self.run_column_test(column) - - -def test_batch(wiped_database): - engine = create_engine(get_postgres_connection_string()) - table_tester = TableTester( - table_name="batches", - columns=[ - ColumnTester( - column_name="strategy", - type_=postgresql.ENUM, - 
allowed_values=get_enum_values(CollectorType), - ), - ColumnTester( - column_name="user_id", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="status", - type_=postgresql.ENUM, - allowed_values=get_enum_values(BatchStatus), - ), - ColumnTester( - column_name="total_url_count", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="original_url_count", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="duplicate_url_count", - type_=sa.Integer, - allowed_values=[1], - ), - ColumnTester( - column_name="strategy_success_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="metadata_success_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="agency_match_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="record_type_match_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="record_category_match_rate", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="compute_time", - type_=sa.Float, - allowed_values=[1.0], - ), - ColumnTester( - column_name="parameters", - type_=sa.JSON, - allowed_values=[{}] - ) - - ], - engine=engine - ) - - table_tester.run_column_tests() - -def test_url(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - table_tester = TableTester( - table_name="urls", - columns=[ - ColumnTester( - column_name="batch_id", - type_=sa.Integer, - allowed_values=[batch_id], - ), - ColumnTester( - column_name="url", - type_=sa.String, - allowed_values=["https://example.com"], - ), - ColumnTester( - column_name="collector_metadata", - type_=sa.JSON, - allowed_values=[{}] - ), - ColumnTester( - column_name="outcome", - type_=postgresql.ENUM, - allowed_values=get_enum_values(URLStatus) - ), - ColumnTester( - column_name="name", - type_=sa.String, - allowed_values=['test'], - ) - ], - 
engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - -def test_html_content(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - iui: InsertURLsInfo = db_data_creator.urls(batch_id=batch_id, url_count=1) - - table_tester = TableTester( - table_name="url_html_content", - columns=[ - ColumnTester( - column_name="url_id", - type_=sa.Integer, - allowed_values=[iui.url_mappings[0].url_id] - ), - ColumnTester( - column_name="content_type", - type_=postgresql.ENUM, - allowed_values=get_enum_values(URLHTMLContentType) - ), - ColumnTester( - column_name="content", - type_=sa.Text, - allowed_values=["Text"] - ) - ], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - -def test_root_url(db_data_creator: DBDataCreator): - - table_tester = TableTester( - table_name="root_urls", - columns=[ - ColumnTester( - column_name="url", - type_=sa.String, - allowed_values=["https://example.com"] - ), - ColumnTester( - column_name="page_title", - type_=sa.String, - allowed_values=["Text"] - ), - ColumnTester( - column_name="page_description", - type_=sa.String, - allowed_values=["Text"] - ) - ], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() - - -@pytest.mark.asyncio -async def test_upsert_new_agencies(db_data_creator: DBDataCreator): - """ - Check that if the agency doesn't exist, it is added - But if the agency does exist, it is updated with new information - """ - - suggestions = [] - for i in range(3): - suggestion = URLAgencySuggestionInfo( - url_id=1, - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=i, - agency_name=f"Test Agency {i}", - state=f"Test State {i}", - county=f"Test County {i}", - locality=f"Test Locality {i}", - user_id=1 - ) - suggestions.append(suggestion) - - adb_client = db_data_creator.adb_client - await adb_client.upsert_new_agencies(suggestions) - - update_suggestion = URLAgencySuggestionInfo( - url_id=1, - 
suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=0, - agency_name="Updated Test Agency", - state="Updated Test State", - county="Updated Test County", - locality="Updated Test Locality", - user_id=1 - ) - - await adb_client.upsert_new_agencies([update_suggestion]) - - rows = await adb_client.get_all(Agency) - - assert len(rows) == 3 - - d = {} - for row in rows: - d[row.agency_id] = row.name - - assert d[0] == "Updated Test Agency" - assert d[1] == "Test Agency 1" - assert d[2] == "Test Agency 2" diff --git a/tests/automated/integration/tasks/scheduled/sync/__init__.py b/tests/automated/integration/tasks/scheduled/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/agency/__init__.py b/tests/automated/integration/tasks/scheduled/sync/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/conftest.py b/tests/automated/integration/tasks/scheduled/sync/agency/conftest.py similarity index 74% rename from tests/automated/integration/tasks/scheduled/agency_sync/conftest.py rename to tests/automated/integration/tasks/scheduled/sync/agency/conftest.py index b621250f..8ba4221f 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/conftest.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/conftest.py @@ -1,7 +1,7 @@ import pytest_asyncio -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import update_existing_agencies_updated_at, \ +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import update_existing_agencies_updated_at, \ add_existing_agencies @pytest_asyncio.fixture diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/data.py 
b/tests/automated/integration/tasks/scheduled/sync/agency/data.py similarity index 97% rename from tests/automated/integration/tasks/scheduled/agency_sync/data.py rename to tests/automated/integration/tasks/scheduled/sync/agency/data.py index fa06ea33..d3227393 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/data.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/data.py @@ -1,6 +1,6 @@ from datetime import datetime -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo PREEXISTING_AGENCY_1 = AgenciesSyncResponseInnerInfo( display_name="Preexisting Agency 1", diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/agency/existence_checker.py similarity index 80% rename from tests/automated/integration/tasks/scheduled/agency_sync/existence_checker.py rename to tests/automated/integration/tasks/scheduled/sync/agency/existence_checker.py index 150df5b0..292f4aea 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/existence_checker.py @@ -1,6 +1,6 @@ -from src.db.models.instantiations.agency import Agency -from src.external.pdap.dtos.agencies_sync import AgenciesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.agency_sync.data import FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from tests.automated.integration.tasks.scheduled.sync.agency.data import FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE class AgencyChecker: diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/helpers.py 
b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py similarity index 77% rename from tests/automated/integration/tasks/scheduled/agency_sync/helpers.py rename to tests/automated/integration/tasks/scheduled/sync/agency/helpers.py index c05e61f7..a60f0586 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/helpers.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/helpers.py @@ -2,13 +2,13 @@ from datetime import timedelta from unittest.mock import patch -from sqlalchemy import select, func, TIMESTAMP, cast +from sqlalchemy import select, func, TIMESTAMP, cast, update from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState from src.external.pdap.client import PDAPClient -from tests.automated.integration.tasks.scheduled.agency_sync.data import PREEXISTING_AGENCIES +from tests.automated.integration.tasks.scheduled.sync.agency.data import PREEXISTING_AGENCIES async def check_sync_concluded( @@ -45,18 +45,13 @@ async def check_sync_concluded( async def update_existing_agencies_updated_at(db_data_creator): - update_mappings = [] for preexisting_agency in PREEXISTING_AGENCIES: - update_mapping = { - "agency_id": preexisting_agency.agency_id, - "updated_at": preexisting_agency.updated_at - } - update_mappings.append(update_mapping) - await db_data_creator.adb_client.bulk_update( - model=Agency, - mappings=update_mappings, - ) - + query = ( + update(Agency) + .where(Agency.agency_id == preexisting_agency.agency_id) + .values(updated_at=preexisting_agency.updated_at) + ) + await db_data_creator.adb_client.execute(query) async def add_existing_agencies(db_data_creator): agencies_to_add = [] diff --git 
a/tests/automated/integration/tasks/scheduled/agency_sync/test_happy_path.py b/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py similarity index 77% rename from tests/automated/integration/tasks/scheduled/agency_sync/test_happy_path.py rename to tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py index 863acf5c..02cefa3e 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_happy_path.py @@ -3,17 +3,18 @@ import pytest from sqlalchemy import select -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters -from src.db.models.instantiations.agency import Agency -from tests.automated.integration.tasks.scheduled.agency_sync.data import AGENCIES_SYNC_RESPONSES -from tests.automated.integration.tasks.scheduled.agency_sync.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import check_sync_concluded, patch_sync_agencies +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.models.instantiations.agency.sqlalchemy import Agency +from tests.automated.integration.tasks.scheduled.sync.agency.data import AGENCIES_SYNC_RESPONSES +from tests.automated.integration.tasks.scheduled.sync.agency.existence_checker import AgencyChecker +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import check_sync_concluded, patch_sync_agencies from tests.helpers.asserts import assert_task_run_success @pytest.mark.asyncio async def test_agency_sync_happy_path( + wiped_database, setup: SyncAgenciesTaskOperator ): operator = setup diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/test_interruption.py 
b/tests/automated/integration/tasks/scheduled/sync/agency/test_interruption.py similarity index 84% rename from tests/automated/integration/tasks/scheduled/agency_sync/test_interruption.py rename to tests/automated/integration/tasks/scheduled/sync/agency/test_interruption.py index f11e4e1f..41f4b86c 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_interruption.py @@ -1,14 +1,14 @@ import pytest from sqlalchemy import select -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator from src.core.tasks.url.enums import TaskOperatorOutcome -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState -from tests.automated.integration.tasks.scheduled.agency_sync.data import FIRST_CALL_RESPONSE, \ +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from tests.automated.integration.tasks.scheduled.sync.agency.data import FIRST_CALL_RESPONSE, \ THIRD_CALL_RESPONSE, SECOND_CALL_RESPONSE -from tests.automated.integration.tasks.scheduled.agency_sync.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import patch_sync_agencies, check_sync_concluded +from tests.automated.integration.tasks.scheduled.sync.agency.existence_checker import AgencyChecker +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import patch_sync_agencies, check_sync_concluded @pytest.mark.asyncio diff --git a/tests/automated/integration/tasks/scheduled/agency_sync/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/sync/agency/test_no_new_results.py similarity index 74% rename from 
tests/automated/integration/tasks/scheduled/agency_sync/test_no_new_results.py rename to tests/automated/integration/tasks/scheduled/sync/agency/test_no_new_results.py index fcc353ef..20a179bd 100644 --- a/tests/automated/integration/tasks/scheduled/agency_sync/test_no_new_results.py +++ b/tests/automated/integration/tasks/scheduled/sync/agency/test_no_new_results.py @@ -4,13 +4,13 @@ import pytest from sqlalchemy import select -from src.core.tasks.scheduled.operators.agency_sync.core import SyncAgenciesTaskOperator -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters -from src.db.models.instantiations.agency import Agency -from src.db.models.instantiations.sync_state_agencies import AgenciesSyncState -from tests.automated.integration.tasks.scheduled.agency_sync.data import THIRD_CALL_RESPONSE -from tests.automated.integration.tasks.scheduled.agency_sync.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.agency_sync.helpers import patch_sync_agencies, check_sync_concluded +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.instantiations.sync_state.agencies import AgenciesSyncState +from tests.automated.integration.tasks.scheduled.sync.agency.data import THIRD_CALL_RESPONSE +from tests.automated.integration.tasks.scheduled.sync.agency.existence_checker import AgencyChecker +from tests.automated.integration.tasks.scheduled.sync.agency.helpers import patch_sync_agencies, check_sync_concluded from tests.helpers.asserts import assert_task_run_success diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/automated/integration/tasks/scheduled/sync/data_sources/check.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/check.py new file mode 100644 index 00000000..5968831f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/check.py @@ -0,0 +1,41 @@ +from datetime import timedelta + +from sqlalchemy import select, cast, func, TIMESTAMP + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState +from src.db.models.instantiations.url.core.sqlalchemy import URL + + +async def check_sync_concluded( + db_client: AsyncDatabaseClient, + check_updated_at: bool = True +): + + current_db_datetime = await db_client.scalar( + select( + cast(func.now(), TIMESTAMP) + ) + ) + + sync_state_results = await db_client.scalar( + select( + DataSourcesSyncState + ) + ) + assert sync_state_results.current_page is None + assert sync_state_results.last_full_sync_at > current_db_datetime - timedelta(minutes=5) + assert sync_state_results.current_cutoff_date > (current_db_datetime - timedelta(days=2)).date() + + if not check_updated_at: + return + + updated_ats = await db_client.scalars( + select( + URL.updated_at + ) + ) + assert all( + updated_at > current_db_datetime - timedelta(minutes=5) + for updated_at in updated_ats + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py new file mode 100644 index 00000000..470504ab --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/conftest.py @@ -0,0 +1,16 @@ +import pytest_asyncio + +from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.external.pdap.client import PDAPClient +from tests.helpers.db_data_creator import DBDataCreator + + +@pytest_asyncio.fixture +async def test_operator( + 
db_data_creator: DBDataCreator, + mock_pdap_client: PDAPClient +) -> SyncDataSourcesTaskOperator: + return SyncDataSourcesTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=mock_pdap_client + ) diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py new file mode 100644 index 00000000..d034def8 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/existence_checker.py @@ -0,0 +1,42 @@ +from collections import defaultdict + +from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo + + +class URLExistenceChecker: + + def __init__( + self, + responses: list[DataSourcesSyncResponseInfo], + url_ds_links: list[URLDataSource], + url_agency_links: list[LinkURLAgency] + ): + self._ds_id_response_dict: dict[int, DataSourcesSyncResponseInnerInfo] = {} + for response in responses: + for data_source in response.data_sources: + self._ds_id_response_dict[data_source.id] = data_source + self._ds_id_url_link_dict = {} + for link in url_ds_links: + self._ds_id_url_link_dict[link.data_source_id] = link.url_id + self._url_id_agency_link_dict = defaultdict(list) + for link in url_agency_links: + self._url_id_agency_link_dict[link.url_id].append(link.agency_id) + + + def check(self, url: URL): + ds_id = self._ds_id_url_link_dict.get(url.id) + if ds_id is None: + raise AssertionError(f"URL {url.id} has no data source link") + response = self._ds_id_response_dict.get(ds_id) + if response is None: + raise AssertionError(f"Data source {ds_id} has no response") + + assert response.url == url.url + assert response.description == 
url.description + assert response.name == url.name + + agency_ids = self._url_id_agency_link_dict.get(url.id) + assert set(response.agency_ids) == set(agency_ids) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py new file mode 100644 index 00000000..932d2518 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/core.py @@ -0,0 +1,14 @@ +from contextlib import contextmanager +from unittest.mock import patch + +from src.external.pdap.client import PDAPClient + + +@contextmanager +def patch_sync_data_sources(side_effects: list): + with patch.object( + PDAPClient, + "sync_data_sources", + side_effect=side_effects + ): + yield \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py new file mode 100644 index 00000000..787a60f0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/data.py @@ -0,0 +1,100 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder, AgencyAssigned +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import TestSCURLSetupEntry +from 
tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.core import TestURLSetupEntry + +ENTRIES = [ + TestURLSetupEntry( + # A URL in both DBs that should be overwritten + url='https://example.com/1', + ds_info=TestDSURLSetupEntry( + id=100, + name='Overwritten URL 1 Name', + description='Overwritten URL 1 Description', + url_status=DataSourcesURLStatus.OK, + approval_status=ApprovalStatus.APPROVED, + record_type=RecordType.ACCIDENT_REPORTS, + agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.TWO], + sync_response_order=SyncResponseOrder.FIRST + ), + sc_info=TestSCURLSetupEntry( + name='Pre-existing URL 1 Name', + description='Pre-existing URL 1 Description', + record_type=RecordType.ACCIDENT_REPORTS, + url_status=URLStatus.PENDING, + agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] + ), + final_url_status=URLStatus.VALIDATED + ), + TestURLSetupEntry( + # A DS-only approved but broken URL + url='https://example.com/2', + ds_info=TestDSURLSetupEntry( + id=101, + name='New URL 2 Name', + description='New URL 2 Description', + url_status=DataSourcesURLStatus.BROKEN, + approval_status=ApprovalStatus.APPROVED, + record_type=RecordType.INCARCERATION_RECORDS, + agencies_assigned=[AgencyAssigned.TWO], + sync_response_order=SyncResponseOrder.FIRST + ), + sc_info=None, + final_url_status=URLStatus.NOT_FOUND + ), + TestURLSetupEntry( + # An SC-only pending URL, should be unchanged. 
+ url='https://example.com/3', + ds_info=None, + sc_info=TestSCURLSetupEntry( + name='Pre-existing URL 3 Name', + description='Pre-existing URL 3 Description', + record_type=RecordType.FIELD_CONTACTS, + url_status=URLStatus.PENDING, + agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] + ), + final_url_status=URLStatus.PENDING + ), + TestURLSetupEntry( + # A DS-only rejected URL + url='https://example.com/4', + ds_info=TestDSURLSetupEntry( + id=102, + name='New URL 4 Name', + description=None, + url_status=DataSourcesURLStatus.OK, + approval_status=ApprovalStatus.REJECTED, + record_type=RecordType.ACCIDENT_REPORTS, + agencies_assigned=[AgencyAssigned.ONE], + sync_response_order=SyncResponseOrder.FIRST + ), + sc_info=None, + final_url_status=URLStatus.NOT_RELEVANT + ), + TestURLSetupEntry( + # A pre-existing URL in the second response + url='https://example.com/5', + ds_info=TestDSURLSetupEntry( + id=103, + name='New URL 5 Name', + description=None, + url_status=DataSourcesURLStatus.OK, + approval_status=ApprovalStatus.APPROVED, + record_type=RecordType.INCARCERATION_RECORDS, + agencies_assigned=[AgencyAssigned.ONE], + sync_response_order=SyncResponseOrder.SECOND + ), + sc_info=TestSCURLSetupEntry( + name='Pre-existing URL 5 Name', + description='Pre-existing URL 5 Description', + record_type=None, + url_status=URLStatus.PENDING, + agencies_assigned=[] + ), + final_url_status=URLStatus.VALIDATED + ) +] + diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py new file mode 100644 index 00000000..fd1e1da2 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/enums.py @@ -0,0 +1,16 @@ +from enum import Enum + + +class SyncResponseOrder(Enum): + """Represents which sync response the entry is in.""" + FIRST = 1 + SECOND = 2 + # No entries should be in 3 + THIRD = 3 + + +class AgencyAssigned(Enum): + """Represents 
which of several pre-created agencies the entry is assigned to.""" + ONE = 1 + TWO = 2 + THREE = 3 diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/__init__.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py new file mode 100644 index 00000000..f7fd5765 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/agency.py @@ -0,0 +1,31 @@ +from sqlalchemy import select + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.agency.sqlalchemy import Agency +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned + + +class AgencyAssignmentManager: + + def __init__(self, adb_client: AsyncDatabaseClient): + self.adb_client = adb_client + self._dict: dict[AgencyAssigned, int] = {} + + async def setup(self): + agencies = [] + for ag_enum in AgencyAssigned: + agency = Agency( + agency_id=ag_enum.value, + name=f"Test Agency {ag_enum.name}", + state="test_state", + county="test_county", + locality="test_locality" + ) + agencies.append(agency) + await self.adb_client.add_all(agencies) + agency_ids = await self.adb_client.scalars(select(Agency.agency_id)) + for ag_enum, agency_id in zip(AgencyAssigned, agency_ids): + self._dict[ag_enum] = agency_id + + async def get(self, ag_enum: AgencyAssigned) -> int: + return self._dict[ag_enum] diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py new file mode 100644 index 00000000..79f44f88 --- /dev/null +++ 
b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/core.py @@ -0,0 +1,111 @@ +from collections import defaultdict + +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo, DataSourcesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.agency import AgencyAssignmentManager +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.queries.check import \ + CheckURLQueryBuilder +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.url import URLSetupFunctor +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.core import TestURLSetupEntry +from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord + + +class DataSourcesSyncTestSetupManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient, + entries: list[TestURLSetupEntry], + ): + self.adb_client = adb_client + self.entries = entries + self.agency_assignment_manager = AgencyAssignmentManager(self.adb_client) + + self.url_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} + self.ds_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} + self.sync_response_order_to_setup_record: dict[ + SyncResponseOrder, list[TestURLPostSetupRecord] + ] = defaultdict(list) + + self.response_dict: dict[ + SyncResponseOrder, list[DataSourcesSyncResponseInnerInfo] + ] = defaultdict(list) + + async def setup(self): + await self.setup_agencies() + await self.setup_entries() + + async def setup_entries(self): + for entry in self.entries: + await self.setup_entry(entry) + + async def setup_entry( + self, + entry: TestURLSetupEntry + ) -> None: + """ + Modifies: + self.url_id_to_setup_record + 
self.ds_id_to_setup_record + self.response_dict + """ + functor = URLSetupFunctor( + entry=entry, + agency_assignment_manager=self.agency_assignment_manager, + adb_client=self.adb_client + ) + result = await functor() + response_info = result.ds_response_info + if response_info is not None: + self.response_dict[entry.ds_info.sync_response_order].append(response_info) + if result.url_id is not None: + self.url_id_to_setup_record[result.url_id] = result + if result.data_sources_id is not None: + self.ds_id_to_setup_record[result.data_sources_id] = result + if entry.ds_info is not None: + self.sync_response_order_to_setup_record[ + entry.ds_info.sync_response_order + ].append(result) + + async def setup_agencies(self): + await self.agency_assignment_manager.setup() + + async def get_data_sources_sync_responses( + self, + orders: list[SyncResponseOrder | ValueError] + ) -> list[DataSourcesSyncResponseInfo]: + results = [] + for order in orders: + results.append( + DataSourcesSyncResponseInfo( + data_sources=self.response_dict[order] + ) + ) + return results + + async def check_via_url(self, url_id: int): + builder = CheckURLQueryBuilder( + record=self.url_id_to_setup_record[url_id] + ) + await self.adb_client.run_query_builder(builder) + + async def check_via_data_source(self, data_source_id: int): + builder = CheckURLQueryBuilder( + record=self.ds_id_to_setup_record[data_source_id] + ) + await self.adb_client.run_query_builder(builder) + + async def check_results(self): + for url_id in self.url_id_to_setup_record.keys(): + await self.check_via_url(url_id) + for data_source_id in self.ds_id_to_setup_record.keys(): + await self.check_via_data_source(data_source_id) + + async def check_via_sync_response_order(self, order: SyncResponseOrder): + records = self.sync_response_order_to_setup_record[order] + for record in records: + builder = CheckURLQueryBuilder( + record=record + ) + await self.adb_client.run_query_builder(builder) diff --git 
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload

from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource
from src.db.queries.base.builder import QueryBuilderBase
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord
from src.db.helpers.session import session_helper as sh


class CheckURLQueryBuilder(QueryBuilderBase):
    """Asserts that a URL row (and its linked properties) matches a
    post-setup record's expected final values."""

    def __init__(self, record: TestURLPostSetupRecord):
        super().__init__()
        self.record = record

    async def run(self, session: AsyncSession) -> None:
        """Check if url and associated properties match record.

        Raises:
            AssertionError: if url and associated properties do not match record
        """
        stmt = (
            select(URL)
            .options(
                selectinload(URL.data_source),
                selectinload(URL.confirmed_agencies),
            )
            .outerjoin(URLDataSource, URL.id == URLDataSource.url_id)
        )
        # Narrow by whichever identifiers the record carries.
        if self.record.url_id is not None:
            stmt = stmt.where(URL.id == self.record.url_id)
        if self.record.data_sources_id is not None:
            stmt = stmt.where(
                URLDataSource.data_source_id == self.record.data_sources_id
            )

        url = await sh.one_or_none(session=session, query=stmt)
        assert url is not None, f"URL not found for {self.record}"
        await self.check_results(url)

    async def check_results(self, url: URL):
        """Assert each final property of the fetched URL against the record."""
        record = self.record
        assert url.record_type == record.final_record_type
        assert url.description == record.final_description
        assert url.name == record.final_name
        confirmed_ids = {link.agency_id for link in url.confirmed_agencies}
        assert confirmed_ids == set(record.final_agency_ids)
        assert url.outcome == record.final_url_status
from pendulum import today

from src.db.client.async_ import AsyncDatabaseClient
from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.agency import AgencyAssignmentManager
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.core import TestURLSetupEntry
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import \
    TestDSURLSetupEntry
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import \
    TestSCURLSetupEntry


class URLSetupFunctor:
    """One-shot setup helper for a single test URL.

    Inserts the URL's prior SC-DB state (if any) and builds the DS sync
    response entry (if any), then returns a post-setup record combining
    the entry with the database-generated ids.
    """

    def __init__(
        self,
        entry: TestURLSetupEntry,
        agency_assignment_manager: AgencyAssignmentManager,
        adb_client: AsyncDatabaseClient
    ):
        self.adb_client = adb_client
        self.agency_assignment_manager = agency_assignment_manager
        self.prime_entry = entry
        # Populated during setup; None until the relevant side is set up.
        self.sc_agency_ids: list[int] | None = None
        self.ds_agency_ids: list[int] | None = None
        self.sc_url_id: int | None = None
        self.ds_response_info: DataSourcesSyncResponseInnerInfo | None = None

    async def __call__(self) -> TestURLPostSetupRecord:
        """Run setup and package the results into a post-setup record."""
        await self.setup_entry()
        return TestURLPostSetupRecord(
            url_id=self.sc_url_id,
            sc_setup_entry=self.prime_entry.sc_info,
            ds_setup_entry=self.prime_entry.ds_info,
            sc_agency_ids=self.sc_agency_ids,
            ds_agency_ids=self.ds_agency_ids,
            ds_response_info=self.ds_response_info,
            final_url_status=self.prime_entry.final_url_status,
        )

    async def setup_entry(self) -> None:
        """Set up whichever sides (SC and/or DS) the entry describes."""
        if self.prime_entry.sc_info is not None:
            self.sc_url_id = await self.setup_sc_entry(self.prime_entry.sc_info)
        if self.prime_entry.ds_info is not None:
            self.ds_response_info = await self.setup_ds_entry(self.prime_entry.ds_info)

    async def get_agency_ids(self, ags: list[AgencyAssigned]) -> list[int]:
        """Resolve assigned-agency markers to concrete agency ids."""
        results = []
        for ag in ags:
            results.append(await self.agency_assignment_manager.get(ag))
        return results

    async def setup_sc_entry(
        self,
        entry: TestSCURLSetupEntry
    ) -> int:
        """Set up source collector entry and return url id."""
        # Fix: operate on the `entry` argument rather than re-reading
        # self.prime_entry.sc_info (same object, but the parameter was ignored).
        self.sc_agency_ids = await self.get_agency_ids(entry.agencies_assigned)
        url = URL(
            url=self.prime_entry.url,
            name=entry.name,
            description=entry.description,
            collector_metadata={},
            outcome=entry.url_status.value,
            record_type=entry.record_type.value if entry.record_type is not None else None,
        )
        url_id = await self.adb_client.add(url, return_id=True)
        links = [
            LinkURLAgency(url_id=url_id, agency_id=agency_id)
            for agency_id in self.sc_agency_ids
        ]
        await self.adb_client.add_all(links)
        return url_id

    async def setup_ds_entry(
        self,
        ds_entry: TestDSURLSetupEntry
    ) -> DataSourcesSyncResponseInnerInfo:
        """Set up data source entry and return response info."""
        # Fix: use the `ds_entry` argument instead of self.prime_entry.ds_info.
        self.ds_agency_ids = await self.get_agency_ids(ds_entry.agencies_assigned)
        return DataSourcesSyncResponseInnerInfo(
            id=ds_entry.id,
            url=self.prime_entry.url,
            name=ds_entry.name,
            description=ds_entry.description,
            url_status=ds_entry.url_status,
            approval_status=ds_entry.approval_status,
            record_type=ds_entry.record_type,
            updated_at=today(),
            agency_ids=self.ds_agency_ids
        )
# --- tests/.../sync/data_sources/setup/models/url/core.py ---
from pydantic import BaseModel

from src.collectors.enums import URLStatus
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.data_sources import \
    TestDSURLSetupEntry
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.models.url.source_collector import \
    TestSCURLSetupEntry


class TestURLSetupEntry(BaseModel):
    """One test URL together with its prior state in the DS and/or SC databases."""
    url: str
    ds_info: TestDSURLSetupEntry | None  # Represents URL previously existing in DS DB
    sc_info: TestSCURLSetupEntry | None  # Represents URL previously existing in SC DB

    final_url_status: URLStatus  # Status the URL row should hold after the sync


# --- tests/.../sync/data_sources/setup/models/url/data_sources.py ---
from src.core.enums import RecordType
from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned, SyncResponseOrder


class TestDSURLSetupEntry(BaseModel):
    """Represents URL previously existing in DS DB.

    These values should overwrite any SC values
    """
    id: int  # ID of URL in DS App
    name: str
    description: str | None
    url_status: DataSourcesURLStatus
    approval_status: ApprovalStatus
    record_type: RecordType
    agencies_assigned: list[AgencyAssigned]
    sync_response_order: SyncResponseOrder


# --- tests/.../sync/data_sources/setup/models/url/post.py ---
from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo


class TestURLPostSetupRecord(BaseModel):
    """Stores a setup entry along with relevant database-generated ids.

    The ``final_*`` properties resolve the value the URL row should hold
    after the sync: DS values win over SC values. They assume at least one
    of ``ds_setup_entry``/``sc_setup_entry`` is set — TODO confirm callers
    always provide one.
    """
    url_id: int | None
    sc_setup_entry: TestSCURLSetupEntry | None
    ds_setup_entry: TestDSURLSetupEntry | None
    sc_agency_ids: list[int] | None
    ds_agency_ids: list[int] | None
    ds_response_info: DataSourcesSyncResponseInnerInfo | None
    final_url_status: URLStatus

    @property
    def data_sources_id(self) -> int | None:
        """Id of the URL in the DS app, if it existed there."""
        if self.ds_setup_entry is None:
            return None
        return self.ds_setup_entry.id

    @property
    def final_record_type(self) -> RecordType | None:
        # Fix: SC record_type is `RecordType | None`, so this may return None.
        if self.ds_setup_entry is not None:
            return self.ds_setup_entry.record_type
        return self.sc_setup_entry.record_type

    @property
    def final_name(self) -> str:
        if self.ds_setup_entry is not None:
            return self.ds_setup_entry.name
        return self.sc_setup_entry.name

    @property
    def final_description(self) -> str | None:
        # Fix: DS description is `str | None`, so this may return None.
        if self.ds_setup_entry is not None:
            return self.ds_setup_entry.description
        return self.sc_setup_entry.description

    @property
    def final_agency_ids(self) -> list[int] | None:
        if self.ds_setup_entry is not None:
            return self.ds_agency_ids
        return self.sc_agency_ids


# --- tests/.../sync/data_sources/setup/models/url/source_collector.py ---
class TestSCURLSetupEntry(BaseModel):
    """Represents URL previously existing in SC DB.

    These values should be overridden by any DS values
    """
    name: str
    description: str
    record_type: RecordType | None
    url_status: URLStatus
    agencies_assigned: list[AgencyAssigned]
from unittest.mock import MagicMock, call

import pytest

from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters
from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator
from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import ENTRIES
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.core import \
    DataSourcesSyncTestSetupManager
from tests.helpers.asserts import assert_task_run_success


@pytest.mark.asyncio
async def test_data_sources_sync_happy_path(
    test_operator: SyncDataSourcesTaskOperator
):
    """A full three-page sync completes and applies every page's responses."""
    adb_client = test_operator.adb_client

    setup_manager = DataSourcesSyncTestSetupManager(
        adb_client=adb_client,
        entries=ENTRIES
    )
    await setup_manager.setup()

    all_responses = await setup_manager.get_data_sources_sync_responses(
        list(SyncResponseOrder)
    )
    with patch_sync_data_sources(all_responses):
        run_info = await test_operator.run_task(1)
        assert_task_run_success(run_info)
        mock_func: MagicMock = test_operator.pdap_client.sync_data_sources

        # One fetch per page, all without a cutoff date.
        mock_func.assert_has_calls(
            [
                call(DataSourcesSyncParameters(cutoff_date=None, page=page))
                for page in (1, 2, 3)
            ]
        )
    await check_sync_concluded(adb_client, check_updated_at=False)

    # Check results according to expectations.
    await setup_manager.check_results()
import pytest
from sqlalchemy import select

from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator
from src.core.tasks.url.enums import TaskOperatorOutcome
from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState
from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import ENTRIES
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.core import \
    DataSourcesSyncTestSetupManager


@pytest.mark.asyncio
async def test_data_sources_sync_interruption(
    test_operator: SyncDataSourcesTaskOperator
):
    """A sync that errors mid-run keeps partial state and resumes cleanly."""
    adb_client = test_operator.adb_client

    setup_manager = DataSourcesSyncTestSetupManager(
        adb_client=adb_client,
        entries=ENTRIES
    )
    await setup_manager.setup()

    page_one = await setup_manager.get_data_sources_sync_responses(
        [SyncResponseOrder.FIRST]
    )

    # First run: page one succeeds, fetching page two raises.
    with patch_sync_data_sources(
        side_effects=page_one + [ValueError("test error")]
    ):
        run_info = await test_operator.run_task(1)
        assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message

    await setup_manager.check_via_sync_response_order(SyncResponseOrder.FIRST)

    # Second response should not be processed
    with pytest.raises(AssertionError):
        await setup_manager.check_via_sync_response_order(SyncResponseOrder.SECOND)

    # Check sync state results: stopped at page 2, no completed full sync.
    sync_state = await adb_client.scalar(select(DataSourcesSyncState))
    assert sync_state.current_page == 2
    assert sync_state.last_full_sync_at is None
    assert sync_state.current_cutoff_date is None

    # Second run picks up the remaining pages.
    remaining = await setup_manager.get_data_sources_sync_responses(
        [SyncResponseOrder.SECOND, SyncResponseOrder.THIRD]
    )
    with patch_sync_data_sources(remaining):
        await test_operator.run_task(2)

    await check_sync_concluded(adb_client)

    await setup_manager.check_via_sync_response_order(SyncResponseOrder.SECOND)
    await setup_manager.check_via_sync_response_order(SyncResponseOrder.THIRD)
from datetime import datetime
from unittest.mock import MagicMock

import pytest

from src.core.tasks.scheduled.sync.data_sources.operator import SyncDataSourcesTaskOperator
from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters
from src.db.models.instantiations.sync_state.data_sources import DataSourcesSyncState
from tests.automated.integration.tasks.scheduled.sync.data_sources.check import check_sync_concluded
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.core import patch_sync_data_sources
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.data import ENTRIES
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import SyncResponseOrder
from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.manager.core import \
    DataSourcesSyncTestSetupManager
from tests.helpers.asserts import assert_task_run_success


@pytest.mark.asyncio
async def test_data_sources_sync_no_new_results(
    test_operator: SyncDataSourcesTaskOperator
):
    """With a stored cutoff date, only the final page is fetched and applied."""
    adb_client = test_operator.adb_client

    cutoff_date = datetime(2025, 5, 1).date()

    setup_manager = DataSourcesSyncTestSetupManager(
        adb_client=adb_client,
        entries=ENTRIES
    )
    await setup_manager.setup()

    final_page = await setup_manager.get_data_sources_sync_responses(
        [SyncResponseOrder.THIRD]
    )

    # Add cutoff date to database
    await adb_client.add(DataSourcesSyncState(current_cutoff_date=cutoff_date))

    with patch_sync_data_sources(final_page):
        run_info = await test_operator.run_task(1)
        assert_task_run_success(run_info)
        mock_func: MagicMock = test_operator.pdap_client.sync_data_sources

        mock_func.assert_called_once_with(
            DataSourcesSyncParameters(cutoff_date=cutoff_date, page=1)
        )
    await check_sync_concluded(adb_client, check_updated_at=False)

    # Check no syncs occurred
    for sync_response_order in (SyncResponseOrder.FIRST, SyncResponseOrder.SECOND):
        with pytest.raises(AssertionError):
            await setup_manager.check_via_sync_response_order(sync_response_order)
src.db.enums import TaskType -from src.db.models.instantiations.url.core import URL -from src.db.models.instantiations.url.error_info import URLErrorInfo -from src.db.models.instantiations.url.suggestion.relevant.auto import AutoRelevantSuggestion +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_task_has_expected_run_info, \ assert_prereqs_met from tests.automated.integration.tasks.url.auto_relevant.setup import setup_operator, setup_urls @@ -28,8 +31,9 @@ async def test_url_auto_relevant_task(db_data_creator): # Get URLs, confirm one is marked as error urls: list[URL] = await adb_client.get_all(URL) assert len(urls) == 3 - statuses = [url.outcome for url in urls] - assert sorted(statuses) == sorted(["pending", "pending", "error"]) + counter = Counter([url.outcome for url in urls]) + assert counter[URLStatus.ERROR] == 1 + assert counter[URLStatus.PENDING] == 2 # Confirm two annotations were created suggestions: list[AutoRelevantSuggestion] = await adb_client.get_all(AutoRelevantSuggestion) diff --git a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py index cb46c845..816724b8 100644 --- a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py +++ b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py @@ -6,7 +6,7 @@ from src.core.tasks.url.operators.url_duplicate.core import URLDuplicateTaskOperator from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy 
import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from tests.automated.integration.tasks.url.duplicate.constants import BATCH_CREATION_PARAMETERS @@ -68,7 +68,7 @@ async def test_url_duplicate_task( assert duplicate_url.url_id in url_ids for url in urls: if url.id == duplicate_url.url_id: - assert url.outcome == URLStatus.DUPLICATE.value + assert url.outcome == URLStatus.DUPLICATE checked_for_duplicates: list[URLCheckedForDuplicate] = await adb_client.get_all(URLCheckedForDuplicate) assert len(checked_for_duplicates) == 2 diff --git a/tests/automated/integration/tasks/url/test_agency_preannotation_task.py b/tests/automated/integration/tasks/url/test_agency_preannotation_task.py index 03961fe0..f7b75f51 100644 --- a/tests/automated/integration/tasks/url/test_agency_preannotation_task.py +++ b/tests/automated/integration/tasks/url/test_agency_preannotation_task.py @@ -14,7 +14,7 @@ from src.external.pdap.enums import MatchAgencyResponseStatus from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from src.db.models.instantiations.agency import Agency +from src.db.models.instantiations.agency.sqlalchemy import Agency from src.collectors.enums import CollectorType, URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.subtasks.agency_identification.auto_googler import AutoGooglerAgencyIdentificationSubtask diff --git a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py index 0bdc3718..4254c4ad 100644 --- a/tests/automated/integration/tasks/url/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/test_submit_approved_url_task.py @@ -7,9 +7,9 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from 
src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator from src.db.enums import TaskType -from src.db.models.instantiations.url.error_info import URLErrorInfo -from src.db.models.instantiations.url.data_source import URLDataSource -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.enums import RecordType, SubmitResponseStatus @@ -139,9 +139,9 @@ async def test_submit_approved_url_task( url_3 = urls[2] # Check URLs have been marked as 'submitted' - assert url_1.outcome == URLStatus.SUBMITTED.value - assert url_2.outcome == URLStatus.SUBMITTED.value - assert url_3.outcome == URLStatus.ERROR.value + assert url_1.outcome == URLStatus.SUBMITTED + assert url_2.outcome == URLStatus.SUBMITTED + assert url_3.outcome == URLStatus.ERROR # Get URL Data Source Links url_data_sources = await db_data_creator.adb_client.get_all(URLDataSource) diff --git a/tests/automated/integration/tasks/url/test_url_404_probe.py b/tests/automated/integration/tasks/url/test_url_404_probe.py index 7a88f759..8966e416 100644 --- a/tests/automated/integration/tasks/url/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/test_url_404_probe.py @@ -8,7 +8,7 @@ from src.core.tasks.url.operators.url_404_probe.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import 
TaskOperatorOutcome from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo @@ -126,10 +126,10 @@ def find_url(url_id: int) -> URL: return url raise Exception(f"URL with id {url_id} not found") - assert find_url(url_id_success).outcome == URLStatus.PENDING.value - assert find_url(url_id_404).outcome == URLStatus.NOT_FOUND.value - assert find_url(url_id_error).outcome == URLStatus.PENDING.value - assert find_url(url_id_initial_error).outcome == URLStatus.ERROR.value + assert find_url(url_id_success).outcome == URLStatus.PENDING + assert find_url(url_id_404).outcome == URLStatus.NOT_FOUND + assert find_url(url_id_error).outcome == URLStatus.PENDING + assert find_url(url_id_initial_error).outcome == URLStatus.ERROR # Check that meets_task_prerequisites now returns False meets_prereqs = await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py index e3d7c529..e9f55240 100644 --- a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py @@ -4,7 +4,7 @@ from src.core.tasks.url.operators.url_miscellaneous_metadata.core import URLMiscellaneousMetadataTaskOperator from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/automated/unit/core/test_core_logger.py b/tests/automated/unit/core/test_core_logger.py index f6738011..580f18bd 100644 --- a/tests/automated/unit/core/test_core_logger.py +++ 
from typing import Protocol, runtime_checkable


@runtime_checkable
class MockProtocol(Protocol):
    """Minimal structural protocol for the validate-helper unit tests.

    Fix: the original imported ``Protocol`` from ``asyncio``, which is the
    transport base class (``asyncio.protocols.Protocol``) — a concrete
    class, not a structural typing protocol. ``typing.Protocol`` is the
    intended construct; ``@runtime_checkable`` additionally permits
    ``isinstance`` checks, which verify method presence only.

    NOTE(review): assumes ``validate_has_protocol`` performs an
    isinstance-style structural check — confirm against its implementation.
    """

    def mock_method(self) -> None:
        """The single member a conforming class must provide."""
        pass
# --- tests/automated/unit/db/utils/validate/test_all_models_of_same_type.py ---
import pytest

from src.db.utils.validate import validate_all_models_of_same_type
from tests.automated.unit.db.utils.validate.mock.class_ import MockClassNoProtocol, MockClassWithProtocol


def test_validate_all_models_of_same_type_happy_path():
    """A homogeneous list of models validates without error."""
    homogeneous = [MockClassNoProtocol() for _ in range(3)]
    validate_all_models_of_same_type(homogeneous)


def test_validate_all_models_of_same_type_error_path():
    """A list mixing model types raises TypeError."""
    mixed = [MockClassNoProtocol(), MockClassNoProtocol(), MockClassWithProtocol()]
    with pytest.raises(TypeError):
        validate_all_models_of_same_type(mixed)


# --- tests/automated/unit/db/utils/validate/test_has_protocol.py ---
from src.db.utils.validate import validate_has_protocol
from tests.automated.unit.db.utils.validate.mock.protocol import MockProtocol


def test_validate_has_protocol_happy_path():
    """A model implementing the protocol validates without error."""
    validate_has_protocol(MockClassWithProtocol(), MockProtocol)


def test_validate_has_protocol_error_path():
    """A model lacking the protocol raises TypeError."""
    with pytest.raises(TypeError):
        validate_has_protocol(MockClassNoProtocol(), MockProtocol)
src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 070f9533..94c3fde6 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -4,7 +4,7 @@ from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index b3e9fec1..672936e0 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -6,7 +6,7 @@ from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.logger import 
AsyncCoreLogger from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO diff --git a/tests/conftest.py b/tests/conftest.py index ee9a6774..4e724563 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Generator, AsyncGenerator, Coroutine +from typing import Any, Generator, AsyncGenerator import pytest import pytest_asyncio @@ -7,11 +7,10 @@ from sqlalchemy import create_engine, inspect, MetaData from sqlalchemy.orm import scoped_session, sessionmaker +from src.core.env_var_manager import EnvVarManager from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient -from src.db.helpers import get_postgres_connection_string -from src.db.models.templates import Base -from src.core.env_var_manager import EnvVarManager +from src.db.helpers.connect import get_postgres_connection_string from src.util.helper_functions import load_from_environment from tests.helpers.alembic_runner import AlembicRunner from tests.helpers.db_data_creator import DBDataCreator diff --git a/tests/helpers/db_data_creator.py b/tests/helpers/db_data_creator.py index 1a1d0a70..a8d8331a 100644 --- a/tests/helpers/db_data_creator.py +++ b/tests/helpers/db_data_creator.py @@ -9,13 +9,13 @@ from src.api.endpoints.review.enums import RejectionReason from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.batch import BatchInfo -from src.db.dtos.duplicate import DuplicateInsertInfo -from src.db.dtos.url.annotations.auto.relevancy import AutoRelevancyAnnotationInput +from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from 
src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.error import URLErrorPydanticInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType -from src.db.dtos.url.core import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.mapping import URLMapping from src.db.client.sync import DatabaseClient from src.db.dtos.url.raw_html import RawHTMLInfo diff --git a/tests/helpers/setup/populate.py b/tests/helpers/setup/populate.py index 1741253b..a6bf5234 100644 --- a/tests/helpers/setup/populate.py +++ b/tests/helpers/setup/populate.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core import URL +from src.db.models.instantiations.url.core.sqlalchemy import URL async def populate_database(adb_client: AsyncDatabaseClient) -> None: diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index ae78c5dd..0536a1d9 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -2,7 +2,7 @@ import dotenv -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index d6f10064..37e71666 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,4 +1,4 @@ -from 
src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from src.collectors.source_collectors.ckan import group_search, package_search, organization_search diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index 772d4d4a..2e4e0227 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,4 +1,4 @@ -from src.db.dtos.batch import BatchInfo +from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/external/pdap/test_sync_agencies.py b/tests/manual/external/pdap/test_sync_agencies.py index 6d070977..6eeaf7c3 100644 --- a/tests/manual/external/pdap/test_sync_agencies.py +++ b/tests/manual/external/pdap/test_sync_agencies.py @@ -1,7 +1,7 @@ import pytest import time -from src.core.tasks.scheduled.operators.agency_sync.dtos.parameters import AgencySyncParameters +from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters @pytest.mark.asyncio diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 251d123c..bc48da9f 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -5,7 +5,7 @@ from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.core 
import URLInfo +from src.db.models.instantiations.url.core.pydantic import URLInfo from tests.helpers.db_data_creator import DBDataCreator URLS = [