diff --git a/ENV.md b/ENV.md index 01a7e7ca..a6208f8b 100644 --- a/ENV.md +++ b/ENV.md @@ -60,8 +60,6 @@ Note that some tasks/subtasks are themselves enabled by other tasks. | Flag | Description | |-------------------------------------|--------------------------------------------------------------------| | `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | -| `SYNC_AGENCIES_TASK_FLAG` | Synchonize agencies from Data Sources App. | -| `SYNC_DATA_SOURCES_TASK_FLAG` | Synchonize data sources from Data Sources App. | | `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | | `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | | `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | @@ -86,6 +84,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag | `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. | | `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. | | `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. | +| `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. | ### Agency ID Subtasks diff --git a/alembic/versions/2025_09_22_1916-e6a1a1b3bad4_add_url_record_type.py b/alembic/versions/2025_09_22_1916-e6a1a1b3bad4_add_url_record_type.py new file mode 100644 index 00000000..cf69e8b0 --- /dev/null +++ b/alembic/versions/2025_09_22_1916-e6a1a1b3bad4_add_url_record_type.py @@ -0,0 +1,127 @@ +"""Add URL record type + +Revision ID: e6a1a1b3bad4 +Revises: 6b3db0c19f9b +Create Date: 2025-09-22 19:16:01.744304 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from src.util.alembic_helpers import url_id_column, created_at_column, id_column + +# revision identifiers, used by Alembic. 
+revision: str = 'e6a1a1b3bad4' +down_revision: Union[str, None] = '6b3db0c19f9b' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +URL_RECORD_TYPE_TABLE_NAME = "url_record_type" + + + + +def upgrade() -> None: + _create_url_record_type_table() + _migrate_url_record_types_to_url_record_type_table() + _drop_record_type_column() + _drop_agencies_sync_state() + _drop_data_sources_sync_state() + +def _drop_agencies_sync_state(): + op.drop_table("agencies_sync_state") + + +def _drop_data_sources_sync_state(): + op.drop_table("data_sources_sync_state") + + +def _create_data_sources_sync_state(): + table = op.create_table( + "data_sources_sync_state", + id_column(), + sa.Column('last_full_sync_at', sa.DateTime(), nullable=True), + sa.Column('current_cutoff_date', sa.Date(), nullable=True), + sa.Column('current_page', sa.Integer(), nullable=True), + ) + # Add row to `data_sources_sync_state` table + op.bulk_insert( + table, + [ + { + "last_full_sync_at": None, + "current_cutoff_date": None, + "current_page": None + } + ] + ) + + +def _create_agencies_sync_state(): + table = op.create_table( + 'agencies_sync_state', + id_column(), + sa.Column('last_full_sync_at', sa.DateTime(), nullable=True), + sa.Column('current_cutoff_date', sa.Date(), nullable=True), + sa.Column('current_page', sa.Integer(), nullable=True), + ) + + # Add row to `agencies_sync_state` table + op.bulk_insert( + table, + [ + { + "last_full_sync_at": None, + "current_cutoff_date": None, + "current_page": None + } + ] + ) + + +def downgrade() -> None: + _add_record_type_column() + _migrate_url_record_types_from_url_record_type_table() + _drop_url_record_type_table() + _create_agencies_sync_state() + _create_data_sources_sync_state() + +def _drop_record_type_column(): + op.drop_column("urls", "record_type") + +def _add_record_type_column(): + op.add_column("urls", sa.Column("record_type", postgresql.ENUM(name="record_type", create_type=False), 
nullable=True)) + + +def _create_url_record_type_table(): + op.create_table( + URL_RECORD_TYPE_TABLE_NAME, + url_id_column(primary_key=True), + sa.Column("record_type", postgresql.ENUM(name="record_type", create_type=False), nullable=False), + created_at_column() + ) + + +def _drop_url_record_type_table(): + op.drop_table(URL_RECORD_TYPE_TABLE_NAME) + + +def _migrate_url_record_types_from_url_record_type_table(): + op.execute(""" + UPDATE urls + SET record_type = url_record_type.record_type + FROM url_record_type + WHERE urls.id = url_record_type.url_id + """) + + +def _migrate_url_record_types_to_url_record_type_table(): + op.execute(""" + INSERT INTO url_record_type (url_id, record_type) + SELECT id, record_type + FROM urls + WHERE record_type IS NOT NULL + """) diff --git a/src/api/endpoints/annotate/all/post/models/request.py b/src/api/endpoints/annotate/all/post/models/request.py index e85f2442..13207d4f 100644 --- a/src/api/endpoints/annotate/all/post/models/request.py +++ b/src/api/endpoints/annotate/all/post/models/request.py @@ -12,8 +12,13 @@ class AllAnnotationPostInfo(BaseModel): location_ids: list[int] @model_validator(mode="after") - def forbid_record_type_if_meta_url(self): - if self.suggested_status == URLType.META_URL and self.record_type is not None: + def forbid_record_type_if_meta_url_or_individual_record(self): + if self.suggested_status not in [ + URLType.META_URL, + URLType.INDIVIDUAL_RECORD, + ]: + return self + if self.record_type is not None: raise FailedValidationException("record_type must be None if suggested_status is META_URL") return self @@ -24,31 +29,39 @@ def require_record_type_if_data_source(self): return self @model_validator(mode="after") - def require_location_if_meta_url_or_data_source(self): - if self.suggested_status not in [URLType.META_URL, URLType.DATA_SOURCE]: + def require_location_if_relevant(self): + if self.suggested_status not in [ + URLType.META_URL, + URLType.DATA_SOURCE, + URLType.INDIVIDUAL_RECORD, + ]: return 
self if len(self.location_ids) == 0: raise FailedValidationException("location_ids must be provided if suggested_status is META_URL or DATA_SOURCE") return self @model_validator(mode="after") - def require_agency_id_if_meta_url_or_data_source(self): - if self.suggested_status not in [URLType.META_URL, URLType.DATA_SOURCE]: + def require_agency_id_if_relevant(self): + if self.suggested_status not in [ + URLType.META_URL, + URLType.DATA_SOURCE, + URLType.INDIVIDUAL_RECORD, + ]: return self if len(self.agency_ids) == 0: raise FailedValidationException("agencies must be provided if suggested_status is META_URL or DATA_SOURCE") return self @model_validator(mode="after") - def forbid_all_else_if_not_meta_url_or_data_source(self): - if self.suggested_status in [URLType.META_URL, URLType.DATA_SOURCE]: + def forbid_all_else_if_not_relevant(self): + if self.suggested_status != URLType.NOT_RELEVANT: return self if self.record_type is not None: - raise FailedValidationException("record_type must be None if suggested_status is not META_URL or DATA_SOURCE") + raise FailedValidationException("record_type must be None if suggested_status is NOT RELEVANT") if len(self.agency_ids) > 0: - raise FailedValidationException("agency_ids must be empty if suggested_status is not META_URL or DATA_SOURCe") + raise FailedValidationException("agency_ids must be empty if suggested_status is NOT RELEVANT") if len(self.location_ids) > 0: - raise FailedValidationException("location_ids must be empty if suggested_status is not META_URL or DATA_SOURCE") + raise FailedValidationException("location_ids must be empty if suggested_status is NOT RELEVANT") return self diff --git a/src/api/endpoints/annotate/all/post/query.py b/src/api/endpoints/annotate/all/post/query.py index 951d83d6..85861fee 100644 --- a/src/api/endpoints/annotate/all/post/query.py +++ b/src/api/endpoints/annotate/all/post/query.py @@ -33,10 +33,7 @@ async def run(self, session: AsyncSession) -> None: session.add(relevant_suggestion) 
# If not relevant, do nothing else - if not self.post_info.suggested_status in [ - URLType.META_URL, - URLType.DATA_SOURCE - ]: + if self.post_info.suggested_status == URLType.NOT_RELEVANT: return locations: list[UserLocationSuggestion] = [] diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 73e3edb8..4f8956dc 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -10,6 +10,7 @@ from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase @@ -37,9 +38,9 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: session.add(batch) await session.flush() - batch_id = batch.id - url_ids = [] - duplicate_urls = [] + batch_id: int = batch.id + url_ids: list[int] = [] + duplicate_urls: list[str] = [] for entry in self.dto.entries: url = URL( @@ -48,10 +49,10 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: description=entry.description, collector_metadata=entry.collector_metadata, status=URLStatus.OK.value, - record_type=entry.record_type.value if entry.record_type is not None else None, source=URLSource.MANUAL ) + async with session.begin_nested(): try: session.add(url) @@ -60,6 +61,15 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: duplicate_urls.append(entry.url) continue await session.flush() + + if entry.record_type is not None: + record_type = URLRecordType( + url_id=url.id, + record_type=entry.record_type, + ) + session.add(record_type) + + link = LinkBatchURL( batch_id=batch_id, url_id=url.id diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index 
48f0ecae..a624f53d 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -14,6 +14,7 @@ from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase @@ -34,7 +35,7 @@ async def run(self, session: AsyncSession) -> None: url = await self._get_url(session) - await self._optionally_update_record_type(url) + await self._optionally_update_record_type(session) # Get existing agency ids existing_agencies = url.confirmed_agencies or [] @@ -88,14 +89,15 @@ async def _optionally_update_optional_metdata(self, url: URL) -> None: self.approval_info.supplying_entity ) - async def _optionally_update_record_type(self, url: URL) -> None: - update_if_not_none( - url, - "record_type", - self.approval_info.record_type.value - if self.approval_info.record_type is not None else None, - required=True + async def _optionally_update_record_type(self, session: AsyncSession) -> None: + if self.approval_info.record_type is None: + return + + record_type = URLRecordType( + url_id=self.approval_info.url_id, + record_type=self.approval_info.record_type.value ) + session.add(record_type) async def _get_url(self, session: AsyncSession) -> URL: query = ( diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 886bd65d..5b6bd08d 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -10,6 +10,7 @@ from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from 
src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html @@ -33,10 +34,14 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut select( URL.id.label(label_url_id), URL.url.label(label_url), - URL.record_type.label(label_record_type_fine), + URLRecordType.record_type.label(label_record_type_fine), URLCompressedHTML.compressed_html.label(label_html), FlagURLValidated.type.label(label_type) ) + .join( + URLRecordType, + URL.id == URLRecordType.url_id + ) .join( URLCompressedHTML, URL.id == URLCompressedHTML.url_id diff --git a/src/core/tasks/scheduled/impl/sync/agency/dtos/parameters.py b/src/core/tasks/scheduled/impl/sync/agency/dtos/parameters.py deleted file mode 100644 index 5afa53f1..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/dtos/parameters.py +++ /dev/null @@ -1,9 +0,0 @@ -from datetime import date -from typing import Optional - -from pydantic import BaseModel - - -class AgencySyncParameters(BaseModel): - cutoff_date: date | None - page: int | None diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py deleted file mode 100644 index 6adff30b..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ /dev/null @@ -1,56 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.core import UpsertAgenciesQueryBuilder -from src.core.tasks.scheduled.impl.sync.check import check_max_sync_requests_not_exceeded -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from 
src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -class SyncAgenciesTaskOperator(ScheduledTaskOperatorBase): - - def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self) -> TaskType: # - return TaskType.SYNC_AGENCIES - - async def inner_task_logic(self): - params = await self.adb_client.get_agencies_sync_parameters() - if params.page is None: - params.page = 1 - - response = await self.pdap_client.sync_agencies(params) - count_agencies_synced = 0 - request_count = 0 - while len(response.agencies) > 0: - await self.update_data(response.agencies) - count_agencies_synced += len(response.agencies) - request_count += 1 - - check_max_sync_requests_not_exceeded(request_count) - - params = AgencySyncParameters( - page=params.page + 1, - cutoff_date=params.cutoff_date - ) - await self.adb_client.update_agencies_sync_progress(params.page) - - response = await self.pdap_client.sync_agencies(params) - - - await self.adb_client.mark_full_agencies_sync() - print(f"Sync complete. 
Synced {count_agencies_synced} agencies") - - async def update_data(self, agencies: list[AgenciesSyncResponseInnerInfo]): - # First, add new agencies - await self.adb_client.run_query_builder( - UpsertAgenciesQueryBuilder(agencies) - ) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py b/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py deleted file mode 100644 index 0e81e97d..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py +++ /dev/null @@ -1,30 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.exc import NoResultFound -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.db.models.impl.state.sync.agencies import AgenciesSyncState -from src.db.queries.base.builder import QueryBuilderBase - - -class GetAgenciesSyncParametersQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> AgencySyncParameters: - query = select( - AgenciesSyncState.current_page, - AgenciesSyncState.current_cutoff_date - ) - try: - result = (await session.execute(query)).mappings().one() - return AgencySyncParameters( - page=result['current_page'], - cutoff_date=result['current_cutoff_date'] - ) - except NoResultFound: - # Add value - state = AgenciesSyncState() - session.add(state) - return AgencySyncParameters(page=None, cutoff_date=None) - - - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py b/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py deleted file mode 100644 index c578c4ea..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py +++ /dev/null @@ -1,13 +0,0 @@ -from sqlalchemy import update, func, text, Update - -from src.db.models.impl.state.sync.agencies import AgenciesSyncState - - -def get_mark_full_agencies_sync_query() -> Update: - return update( - AgenciesSyncState - ).values( - 
last_full_sync_at=func.now(), - current_cutoff_date=func.now() - text('interval \'1 day\''), - current_page=None - ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py b/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py deleted file mode 100644 index 2cebb046..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py +++ /dev/null @@ -1,11 +0,0 @@ -from sqlalchemy import Update, update - -from src.db.models.impl.state.sync.agencies import AgenciesSyncState - - -def get_update_agencies_sync_progress_query(page: int) -> Update: - return update( - AgenciesSyncState - ).values( - current_page=page - ) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py deleted file mode 100644 index 4b944464..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py +++ /dev/null @@ -1,20 +0,0 @@ -from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -def convert_agencies_sync_response_to_agencies_upsert( - agencies: list[AgenciesSyncResponseInnerInfo] -) -> list[AgencyUpsertModel]: - results = [] - for agency in agencies: - results.append( - AgencyUpsertModel( - agency_id=agency.agency_id, - name=agency.display_name, - state=agency.state_name, - county=agency.county_name, - locality=agency.locality_name, - ds_last_updated_at=agency.updated_at - ) - ) - return results diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py deleted file mode 100644 index fc909e48..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py +++ /dev/null @@ -1,30 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.core import UpdateAgencyURLLinksQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.core import UpsertMetaUrlsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ - convert_agencies_sync_response_to_agencies_upsert -from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - -from src.db.helpers.session import session_helper as sh - -class UpsertAgenciesQueryBuilder(QueryBuilderBase): - - def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): - super().__init__() - self.sync_responses = sync_responses - - async def run(self, session: AsyncSession) -> None: - # Upsert Agencies - agency_upserts: list[AgencyUpsertModel] = convert_agencies_sync_response_to_agencies_upsert(self.sync_responses) - await sh.bulk_upsert(session=session, models=agency_upserts) - - # Add and update Meta URLs - meta_urls_query_builder = UpsertMetaUrlsQueryBuilder(self.sync_responses) - await meta_urls_query_builder.run(session=session) - - # Add and remove URL-Agency Links - update_url_links_query_builder = UpdateAgencyURLLinksQueryBuilder(self.sync_responses) - await update_url_links_query_builder.run(session=session) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py deleted file mode 100644 index c05b55f1..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py +++ /dev/null @@ -1,12 +0,0 @@ -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -def extract_urls_from_agencies_sync_response( - responses: list[AgenciesSyncResponseInnerInfo] -) -> list[str]: - url_set: set[str] = set() - for response in responses: - for url in 
response.meta_urls: - url_set.add(url) - - return list(url_set) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py deleted file mode 100644 index 5511ea65..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py +++ /dev/null @@ -1,23 +0,0 @@ -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -from src.util.url_mapper import URLMapper - -def build_links_from_url_mappings_and_sync_responses( - url_mappings: list[URLMapping], - sync_responses: list[AgenciesSyncResponseInnerInfo], -) -> list[LinkURLAgencyPydantic]: - - links: list[LinkURLAgencyPydantic] = [] - - mapper = URLMapper(url_mappings) - for sync_response in sync_responses: - agency_id: int = sync_response.agency_id - for meta_url in sync_response.meta_urls: - url_id: int = mapper.get_id(meta_url) - link = LinkURLAgencyPydantic( - agency_id=agency_id, - url_id=url_id - ) - links.append(link) - return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py deleted file mode 100644 index 37d63a03..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py +++ /dev/null @@ -1,50 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.build import \ - build_links_from_url_mappings_and_sync_responses -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.requester import UpdateAgencyURLLinksRequester -from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.extract import \ - extract_agency_ids_from_agencies_sync_response -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -class UpdateAgencyURLLinksQueryBuilder(QueryBuilderBase): - """Updates agency URL links.""" - - def __init__( - self, - sync_responses: list[AgenciesSyncResponseInnerInfo] - ): - super().__init__() - self._sync_responses = sync_responses - - async def run(self, session: AsyncSession) -> None: - # Get all existing links - requester = UpdateAgencyURLLinksRequester(session) - - # Build new links from sync responses and URL mappings - sync_urls: list[str] = extract_urls_from_agencies_sync_response(self._sync_responses) - url_mappings: list[URLMapping] = await requester.get_url_mappings(urls=sync_urls) - new_links: list[LinkURLAgencyPydantic] = build_links_from_url_mappings_and_sync_responses( - url_mappings=url_mappings, - sync_responses=self._sync_responses, - ) - - sync_agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(self._sync_responses) - old_links: list[LinkURLAgencyPydantic] = await requester.get_current_agency_url_links( - agency_ids=sync_agency_ids, - ) - - new_set: set[LinkURLAgencyPydantic] = set(new_links) - old_set: set[LinkURLAgencyPydantic] = set(old_links) - - links_to_add: list[LinkURLAgencyPydantic] = list(new_set - old_set) - links_to_remove: list[LinkURLAgencyPydantic] = list(old_set - new_set) - - await requester.add_agency_url_links(links=links_to_add) - await requester.remove_agency_url_links(links=links_to_remove) - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py deleted file mode 100644 index 
123bd0ba..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py +++ /dev/null @@ -1,12 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings - - -def filter_non_relevant_mappings( - mappings: list[AgencyURLMappings], - relevant_agency_ids: list[int] -) -> list[AgencyURLMappings]: - relevant_mappings: list[AgencyURLMappings] = [] - for mapping in mappings: - if mapping.agency_id in relevant_agency_ids: - relevant_mappings.append(mapping) - return relevant_mappings \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py deleted file mode 100644 index 9a083719..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class LookupMetaURLLinksQueryBuilder(QueryBuilderBase): - - def __init__(self, agency_ids: list[int]): - super().__init__() - self.agency_ids: list[int] = agency_ids - - async def run(self, session: AsyncSession) -> list[LinkURLAgencyPydantic]: - - query = ( - select( - LinkURLAgency.url_id, - LinkURLAgency.agency_id - ) - .join( - URL, - LinkURLAgency.url_id == URL.id, - ) - .join( - FlagURLValidated, - FlagURLValidated.url_id == URL.id, - ) - .where( - 
FlagURLValidated.type == URLType.META_URL, - LinkURLAgency.agency_id.in_(self.agency_ids), - ) - ) - - mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - links: list[LinkURLAgencyPydantic] = [ - LinkURLAgencyPydantic(**mapping) for mapping in mappings - ] - return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py deleted file mode 100644 index 8b526447..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh - -class LookupURLQueryBuilder(QueryBuilderBase): - - def __init__(self, urls: list[str]): - super().__init__() - self.urls: list[str] = urls - - async def run(self, session: AsyncSession) -> list[URLMapping]: - query = ( - select( - URL.id.label("url_id"), - URL.url, - ) - .where( - URL.url.in_(self.urls), - ) - ) - mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - urls: list[URLMapping] = [ - URLMapping(**mapping) for mapping in mappings - ] - return urls \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py deleted file mode 100644 index 0f3c9d69..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py +++ /dev/null @@ -1,6 +0,0 @@ -from pydantic import BaseModel - - -class AgencyURLMappings(BaseModel): - agency_id: int - url_ids: list[int] \ No newline at end of 
file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py deleted file mode 100644 index 96887dfa..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py +++ /dev/null @@ -1,21 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.links import LookupMetaURLLinksQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.url import LookupURLQueryBuilder -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.templates.requester import RequesterBase - - -class UpdateAgencyURLLinksRequester(RequesterBase): - - async def get_url_mappings(self, urls: list[str]) -> list[URLMapping]: - return await LookupURLQueryBuilder(urls=urls).run(session=self.session) - - async def get_current_agency_url_links(self, agency_ids: list[int]) -> list[LinkURLAgencyPydantic]: - return await LookupMetaURLLinksQueryBuilder(agency_ids=agency_ids).run(session=self.session) - - async def add_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: - await sh.bulk_insert(self.session, models=links) - - async def remove_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: - await sh.bulk_delete(self.session, models=links) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py deleted file mode 100644 index f1bf793d..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py +++ /dev/null @@ -1,57 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.enums import RecordType -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.db.queries.base.builder import QueryBuilderBase - -from src.db.helpers.session import session_helper as sh - -class AddMetaURLsQueryBuilder(QueryBuilderBase): - - """Add Meta URLs to DB with: - - Record type set to CONTACT_INFO_AND_AGENCY_META - - Validation Flag added as META_URL - - Source set to DATA_SOURCES - """ - def __init__(self, urls: list[str]): - super().__init__() - self.urls = urls - - async def run(self, session: AsyncSession) -> list[URLMapping]: - # Add URLs - url_inserts: list[URLInsertModel] = [] - for url in self.urls: - url_inserts.append( - URLInsertModel( - url=url, - record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, - source=URLSource.DATA_SOURCES - ) - ) - url_ids: list[int] = await sh.bulk_insert(session, models=url_inserts, return_ids=True) - - # Connect with URLs - mappings: list[URLMapping] = [ - URLMapping( - url=url, - url_id=url_id, - ) - for url, url_id in zip(self.urls, url_ids) - ] - - # Add Validation Flags - flag_inserts: list[FlagURLValidatedPydantic] = [] - for url_id in url_ids: - flag_inserts.append( - FlagURLValidatedPydantic( - url_id=url_id, - type=URLType.META_URL - ) - ) - await sh.bulk_insert(session, models=flag_inserts) - - return mappings diff --git 
a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py deleted file mode 100644 index 8d3e8785..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py +++ /dev/null @@ -1,27 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams -from src.db.dtos.url.mapping import URLMapping - - -def convert_to_update_meta_urls_params( - lookups: list[MetaURLLookupResponse] -) -> list[UpdateMetaURLsParams]: - return [ - UpdateMetaURLsParams( - url_id=lookup.url_id, - validation_type=lookup.validation_type, - record_type=lookup.record_type, - ) - for lookup in lookups - ] - -def convert_url_lookups_to_url_mappings( - lookups: list[MetaURLLookupResponse] -) -> list[URLMapping]: - return [ - URLMapping( - url_id=lookup.url_id, - url=lookup.url, - ) - for lookup in lookups - ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py deleted file mode 100644 index 6f5c3593..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py +++ /dev/null @@ -1,33 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.requester import UpdateMetaURLsRequester -from src.db.queries.base.builder 
import QueryBuilderBase -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -class UpsertMetaUrlsQueryBuilder(QueryBuilderBase): - """Add and update meta URLs for agencies.""" - - def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): - super().__init__() - self.sync_responses = sync_responses - - async def run(self, session: AsyncSession) -> None: - - requester = UpdateMetaURLsRequester(session) - sync_urls: list[str] = extract_urls_from_agencies_sync_response(self.sync_responses) - - - lookup_responses: list[MetaURLLookupResponse] = \ - await requester.lookup_meta_urls(sync_urls) - await requester.add_new_urls_to_database(lookup_responses) - - filtered_lookup_responses: list[MetaURLLookupResponse] = \ - filter_urls_in_sync(self.sync_responses, lookup_responses=lookup_responses) - await requester.update_existing_urls(filtered_lookup_responses) - - - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py deleted file mode 100644 index 227f0edc..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py +++ /dev/null @@ -1,37 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -def filter_urls_to_add( - lookup_responses: list[MetaURLLookupResponse] -) -> list[str]: - return [ - lookup_response.url - for lookup_response in lookup_responses - if not lookup_response.exists_in_db - ] - -def filter_existing_url_mappings( - lookup_responses: list[MetaURLLookupResponse] -) -> list[MetaURLLookupResponse]: - """Filter only URL mappings that already exist in the database.""" - return [ - lookup_response - for 
lookup_response in lookup_responses - if lookup_response.exists_in_db - ] - -def filter_urls_in_sync( - sync_responses: list[AgenciesSyncResponseInnerInfo], - lookup_responses: list[MetaURLLookupResponse] -) -> list[MetaURLLookupResponse]: - """Filter only URLs that are in sync responses.""" - sync_urls: set[str] = set( - extract_urls_from_agencies_sync_response(sync_responses) - ) - filtered_lookup_responses: list[MetaURLLookupResponse] = [] - for lookup_response in lookup_responses: - if lookup_response.url in sync_urls: - filtered_lookup_responses.append(lookup_response) - return filtered_lookup_responses \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py deleted file mode 100644 index 8a817bd4..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class LookupMetaURLsQueryBuilder(QueryBuilderBase): - """Lookup whether URLs exist in DB and are validated as meta URLs""" - - def __init__(self, urls: list[str]): - super().__init__() - self.urls: list[str] = urls - - async def run(self, session: AsyncSession) -> 
list[MetaURLLookupResponse]: - url_id_label: str = "url_id" - - query = ( - select( - URL.id.label(url_id_label), - URL.url, - URL.record_type, - FlagURLValidated.type - ) - .select_from( - URL - ) - .outerjoin( - FlagURLValidated, - FlagURLValidated.url_id == URL.id, - ) - .where( - URL.url.in_(self.urls) - ) - ) - mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - - urls_in_db = set() - extant_lookup_responses: list[MetaURLLookupResponse] = [] - for mapping in mappings: - url = mapping["url"] - urls_in_db.add(url) - response = MetaURLLookupResponse( - url=url, - url_id=mapping[url_id_label], - record_type=mapping["record_type"], - validation_type=mapping["type"], - ) - extant_lookup_responses.append(response) - - urls_not_in_db = set(self.urls) - set(urls_in_db) - non_extant_lookup_responses = [ - MetaURLLookupResponse( - url=url, - url_id=None, - record_type=None, - validation_type=None, - ) for url in urls_not_in_db - ] - - return extant_lookup_responses + non_extant_lookup_responses diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py deleted file mode 100644 index d054f645..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py +++ /dev/null @@ -1,10 +0,0 @@ -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo - - -def extract_agency_ids_from_agencies_sync_response( - responses: list[AgenciesSyncResponseInnerInfo] -) -> list[int]: - return [ - response.agency_id - for response in responses - ] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py deleted file mode 100644 index da33244e..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py +++ /dev/null @@ 
-1,23 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType - - -class MetaURLLookupResponse(BaseModel): - url: str - url_id: int | None - record_type: RecordType | None - validation_type: URLType | None - - @property - def exists_in_db(self) -> bool: - return self.url_id is not None - - @property - def is_meta_url(self) -> bool: - return self.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META - - @property - def is_validated(self) -> bool: - return self.validation_type is not None diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py deleted file mode 100644 index 0a3e3c76..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py +++ /dev/null @@ -1,48 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.add.core import AddMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.convert import \ - convert_to_update_meta_urls_params, convert_url_lookups_to_url_mappings -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_existing_url_mappings, \ - filter_urls_to_add -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.core import LookupMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.core import UpdateMetaURLsQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams -from src.db.dtos.url.mapping import URLMapping -from src.db.templates.requester import RequesterBase - - -class UpdateMetaURLsRequester(RequesterBase): - - async def lookup_meta_urls( - 
self, - urls: list[str] - ) -> list[MetaURLLookupResponse]: - return await LookupMetaURLsQueryBuilder( - urls - ).run(self.session) - - async def add_new_urls_to_database( - self, - lookup_responses: list[MetaURLLookupResponse] - ) -> list[URLMapping]: - if len(lookup_responses) == 0: - return [] - urls_to_add: list[str] = filter_urls_to_add(lookup_responses) - if len(urls_to_add) == 0: - return [] - return await AddMetaURLsQueryBuilder(urls_to_add).run(self.session) - - async def update_existing_urls( - self, - lookup_responses: list[MetaURLLookupResponse] - ) -> list[URLMapping]: - existing_url_lookups: list[MetaURLLookupResponse] = ( - filter_existing_url_mappings(lookup_responses) - ) - params: list[UpdateMetaURLsParams] = \ - convert_to_update_meta_urls_params(existing_url_lookups) - await UpdateMetaURLsQueryBuilder(params).run(self.session) - existing_url_mappings: list[URLMapping] = \ - convert_url_lookups_to_url_mappings(existing_url_lookups) - return existing_url_mappings - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py deleted file mode 100644 index 1e479652..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py +++ /dev/null @@ -1,39 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.filter import \ - filter_urls_with_non_meta_record_type, filter_urls_with_non_meta_url_validation_flag, \ - filter_urls_without_validation_flag -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams -from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.requester import \ - UpdateMetaURLsUpdateURLAndValidationFlagsRequester -from src.db.queries.base.builder import QueryBuilderBase - - -class UpdateMetaURLsQueryBuilder(QueryBuilderBase): - """Update meta URLs in DB - - Meta URLs should be given a validation status as a Meta URL - and have their record type updated to CONTACT_INFO_AND_AGENCY_META - """ - - def __init__( - self, - params: list[UpdateMetaURLsParams] - ): - super().__init__() - self.params = params - - async def run( - self, - session: AsyncSession - ) -> None: - requester = UpdateMetaURLsUpdateURLAndValidationFlagsRequester(session) - - urls_with_non_meta_record_type: list[int] = filter_urls_with_non_meta_record_type(self.params) - await requester.update_urls(urls_with_non_meta_record_type) - - urls_without_validation_flag: list[int] = filter_urls_without_validation_flag(self.params) - await requester.add_validation_flags(urls_without_validation_flag) - - urls_with_non_meta_url_validation_flag: list[int] = filter_urls_with_non_meta_url_validation_flag(self.params) - await requester.update_validation_flags(urls_with_non_meta_url_validation_flag) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py deleted file mode 100644 index 74cae709..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py +++ /dev/null @@ -1,37 +0,0 @@ -from src.core.enums import RecordType -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams -from src.db.models.impl.flag.url_validated.enums import URLType - - -def filter_urls_with_non_meta_record_type( - params: list[UpdateMetaURLsParams] -) -> list[int]: - url_ids: list[int] = [] - for param in params: - if param.record_type is None: - url_ids.append(param.url_id) - if 
param.record_type != RecordType.CONTACT_INFO_AND_AGENCY_META: - url_ids.append(param.url_id) - - return url_ids - -def filter_urls_without_validation_flag( - params: list[UpdateMetaURLsParams] -) -> list[int]: - url_ids: list[int] = [] - for param in params: - if param.validation_type is None: - url_ids.append(param.url_id) - return url_ids - -def filter_urls_with_non_meta_url_validation_flag( - params: list[UpdateMetaURLsParams] -) -> list[int]: - url_ids: list[int] = [] - for param in params: - if param.validation_type is None: - continue - if param.validation_type != URLType.META_URL: - url_ids.append(param.url_id) - - return url_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py deleted file mode 100644 index c25f3bf1..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType - - -class UpdateMetaURLsParams(BaseModel): - validation_type: URLType | None - url_id: int - record_type: RecordType | None - diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py deleted file mode 100644 index 94cdc401..00000000 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py +++ /dev/null @@ -1,53 +0,0 @@ -from sqlalchemy import update - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from 
src.db.models.impl.url.core.sqlalchemy import URL -from src.db.templates.requester import RequesterBase - -from src.db.helpers.session import session_helper as sh - -class UpdateMetaURLsUpdateURLAndValidationFlagsRequester(RequesterBase): - - async def update_validation_flags(self, url_ids: list[int]) -> None: - """Set validation flag for URLs to Meta URL""" - query = ( - update( - FlagURLValidated - ) - .where( - FlagURLValidated.url_id.in_(url_ids) - ) - .values( - type=URLType.META_URL - ) - ) - await self.session.execute(query) - - async def add_validation_flags(self, url_ids: list[int]) -> None: - inserts: list[FlagURLValidatedPydantic] = [] - for url_id in url_ids: - flag = FlagURLValidatedPydantic( - url_id=url_id, - type=URLType.META_URL, - ) - inserts.append(flag) - - await sh.bulk_insert(self.session, models=inserts) - - async def update_urls(self, url_ids: list[int]) -> None: - """Update URLs and set record type to Contact Info and Agency Meta""" - query = ( - update( - URL - ) - .values( - record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, - ) - .where( - URL.id.in_(url_ids) - ) - ) - await self.session.execute(query) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/check.py b/src/core/tasks/scheduled/impl/sync/check.py deleted file mode 100644 index 3dfe75dc..00000000 --- a/src/core/tasks/scheduled/impl/sync/check.py +++ /dev/null @@ -1,14 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.constants import MAX_SYNC_REQUESTS -from src.core.tasks.scheduled.impl.sync.exceptions import MaxRequestsExceededError - - -def check_max_sync_requests_not_exceeded(request_count: int) -> None: - """ - Raises: - MaxRequestsExceededError: If the number of requests made exceeds the maximum allowed. - """ - - if request_count > MAX_SYNC_REQUESTS: - raise MaxRequestsExceededError( - f"Max requests in a single task run ({MAX_SYNC_REQUESTS}) exceeded." 
- ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/constants.py b/src/core/tasks/scheduled/impl/sync/constants.py deleted file mode 100644 index a58a7aca..00000000 --- a/src/core/tasks/scheduled/impl/sync/constants.py +++ /dev/null @@ -1,7 +0,0 @@ - - -""" -Denotes the maximum number of requests to the Agencies Sync endpoint -permissible in a single task run. -""" -MAX_SYNC_REQUESTS = 30 \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/operator.py b/src/core/tasks/scheduled/impl/sync/data_sources/operator.py deleted file mode 100644 index ad595919..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/operator.py +++ /dev/null @@ -1,48 +0,0 @@ -from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase -from src.core.tasks.scheduled.impl.sync.check import check_max_sync_requests_not_exceeded -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from src.external.pdap.client import PDAPClient - - -class SyncDataSourcesTaskOperator(ScheduledTaskOperatorBase): - - def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self): - return TaskType.SYNC_DATA_SOURCES - - async def inner_task_logic(self): - count_sources_synced = 0 - - params = await self.adb_client.get_data_sources_sync_parameters() - if params.page is None: - params.page = 1 - - response = await self.pdap_client.sync_data_sources(params) - count_sources_synced += len(response.data_sources) - request_count = 1 - while len(response.data_sources) > 0: - 
check_max_sync_requests_not_exceeded(request_count) - await self.adb_client.upsert_urls_from_data_sources(response.data_sources) - - params = DataSourcesSyncParameters( - page=params.page + 1, - cutoff_date=params.cutoff_date - ) - await self.adb_client.update_data_sources_sync_progress(params.page) - - response = await self.pdap_client.sync_data_sources(params) - count_sources_synced += len(response.data_sources) - request_count += 1 - - await self.adb_client.mark_full_data_sources_sync() - print(f"Sync complete. Synced {count_sources_synced} data sources") diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/params.py deleted file mode 100644 index 8a502ef6..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/params.py +++ /dev/null @@ -1,8 +0,0 @@ -from datetime import date - -from pydantic import BaseModel - - -class DataSourcesSyncParameters(BaseModel): - cutoff_date: date | None - page: int | None diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py deleted file mode 100644 index 114eb758..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py +++ /dev/null @@ -1,27 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.exc import NoResultFound -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState -from src.db.queries.base.builder import QueryBuilderBase - - -class GetDataSourcesSyncParametersQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> 
DataSourcesSyncParameters: - query = select( - DataSourcesSyncState.current_page, - DataSourcesSyncState.current_cutoff_date - ) - try: - result = (await session.execute(query)).mappings().one() - return DataSourcesSyncParameters( - page=result['current_page'], - cutoff_date=result['current_cutoff_date'] - ) - except NoResultFound: - # Add value - state = DataSourcesSyncState() - session.add(state) - return DataSourcesSyncParameters(page=None, cutoff_date=None) diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py deleted file mode 100644 index 8d6e0bdb..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py +++ /dev/null @@ -1,13 +0,0 @@ -from sqlalchemy import Update, update, func, text - -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState - - -def get_mark_full_data_sources_sync_query() -> Update: - return update( - DataSourcesSyncState - ).values( - last_full_sync_at=func.now(), - current_cutoff_date=func.now() - text('interval \'1 day\''), - current_page=None - ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py deleted file mode 100644 index d6aaebe0..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py +++ /dev/null @@ -1,11 +0,0 @@ -from sqlalchemy import update, Update - -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState - - -def get_update_data_sources_sync_progress_query(page: int) -> Update: - return update( - DataSourcesSyncState - ).values( - current_page=page - ) diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/__init__.py deleted file mode 100644 index 
e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py deleted file mode 100644 index a265def5..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py +++ /dev/null @@ -1,14 +0,0 @@ -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic - - -def convert_to_link_url_agency_models( - url_id: int, - agency_ids: list[int] -) -> list[LinkURLAgencyPydantic]: - return [ - LinkURLAgencyPydantic( - url_id=url_id, - agency_id=agency_id - ) - for agency_id in agency_ids - ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py deleted file mode 100644 index a000783b..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py +++ /dev/null @@ -1,88 +0,0 @@ -from collections import defaultdict - -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyParams -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from 
src.db.queries.base.builder import QueryBuilderBase - - -class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase): - """Given a set of URL-Agency links, remove all non-matching non-Meta URL links and add new ones.""" - - - def __init__(self, models: list[UpdateLinkURLAgencyParams]): - super().__init__() - self.models = models - self._new_links: dict[int, list[int]] = { - model.url_id: model.new_agency_ids - for model in self.models - } - self._existing_links: dict[int, list[int]] = defaultdict(list) - self.existing_url_ids: set[int] = { - model.url_id for model in self.models - } - - async def _get_existing_links(self, session: AsyncSession) -> None: - """Get existing non-meta URL agency links for provided URL IDs. - - Modifies: - self._existing_links - """ - query = ( - select(LinkURLAgency) - .outerjoin( - FlagURLValidated, - FlagURLValidated.url_id == LinkURLAgency.url_id, - ) - .where( - LinkURLAgency.url_id.in_( - self.existing_url_ids - ), - FlagURLValidated.type != URLType.META_URL - ) - ) - links = await session.scalars(query) - for link in links: - self._existing_links[link.url_id].append(link.agency_id) - - async def _update_links(self, session: AsyncSession) -> None: - # Remove all existing links not in new links - links_to_delete: list[LinkURLAgencyPydantic] = [] - links_to_insert: list[LinkURLAgencyPydantic] = [] - - for url_id in self.existing_url_ids: - new_agency_ids = self._new_links.get(url_id, []) - existing_agency_ids = self._existing_links.get(url_id, []) - # IDs to delete are existing agency ids that are not new agency ids - ids_to_delete = set(existing_agency_ids) - set(new_agency_ids) - # IDs to insert are new agency ids that are not existing agency ids - ids_to_insert = set(new_agency_ids) - set(existing_agency_ids) - - links_to_delete.extend( - convert_to_link_url_agency_models( - url_id=url_id, - agency_ids=list(ids_to_delete) - ) - ) - links_to_insert.extend( - convert_to_link_url_agency_models( - url_id=url_id, - 
agency_ids=list(ids_to_insert) - ) - ) - - await sh.bulk_delete(session=session, models=links_to_delete) - await sh.bulk_insert(session=session, models=links_to_insert) - - async def run(self, session: AsyncSession) -> None: - await self._get_existing_links(session=session) - await self._update_links(session=session) - - diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py deleted file mode 100644 index 6f8a14eb..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py +++ /dev/null @@ -1,7 +0,0 @@ -from pydantic import BaseModel - - -class UpdateLinkURLAgencyParams(BaseModel): - url_id: int - new_agency_ids: list[int] - old_agency_ids: list[int] diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py deleted file mode 100644 index ed5ff8ac..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py +++ /dev/null @@ -1,24 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import URLDataSyncInfo -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import URLType -from src.external.pdap.enums import ApprovalStatus - - -def convert_url_sync_info_to_url_mappings( - url_sync_info: URLDataSyncInfo -) -> URLMapping: - return URLMapping( - url=url_sync_info.url, - url_id=url_sync_info.url_id - ) - -def convert_approval_status_to_validated_type( - approval_status: ApprovalStatus -) -> URLType: - match approval_status: - case ApprovalStatus.APPROVED: - return URLType.DATA_SOURCE - case ApprovalStatus.REJECTED: - return URLType.NOT_RELEVANT - case _: - raise ValueError(f"Invalid approval status: {approval_status}") \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py deleted file mode 100644 index 2b021045..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import final - -from sqlalchemy.ext.asyncio import AsyncSession -from typing_extensions import override - -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import convert_url_sync_info_to_url_mappings -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.filter import filter_for_urls_with_ids, \ - get_mappings_for_urls_without_data_sources -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.param_manager import \ - UpsertURLsFromDataSourcesParamManager -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.requester import UpsertURLsFromDataSourcesDBRequester -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ - InsertURLForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import \ - LookupURLForDataSourcesSyncResponse -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from src.util.url_mapper import URLMapper - - -@final -class UpsertURLsFromDataSourcesQueryBuilder(QueryBuilderBase): - - def __init__(self, sync_infos: list[DataSourcesSyncResponseInnerInfo]): - super().__init__() - self.sync_infos = sync_infos - self.urls = {sync_info.url for sync_info in 
self.sync_infos} - self.param_manager = UpsertURLsFromDataSourcesParamManager( - mapper=URLSyncInfoMapper(self.sync_infos) - ) - self._session: AsyncSession | None = None - self._requester: UpsertURLsFromDataSourcesDBRequester | None = None - # Need to be able to add URL ids first before adding links or other attributes - - @property - def requester(self) -> UpsertURLsFromDataSourcesDBRequester: - """ - Modifies: - self._requester - """ - if self._requester is None: - self._requester = UpsertURLsFromDataSourcesDBRequester(self._session) - return self._requester - - @override - async def run(self, session: AsyncSession) -> None: - """ - Modifies: - self._session - """ - self._session = session - - lookup_results: list[LookupURLForDataSourcesSyncResponse] = await self._lookup_urls() - - # Update existing url and associated metadata - lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse] = filter_for_urls_with_ids(lookup_results) - await self._update_existing_urls(lookups_existing_urls) - await self._update_agency_link(lookups_existing_urls) - existing_url_mappings: list[URLMapping] = [ - convert_url_sync_info_to_url_mappings(lookup.url_info) - for lookup in lookups_existing_urls - ] - - # Add new URLs and associated metadata - mappings_without_data_sources: list[URLMapping] = get_mappings_for_urls_without_data_sources(lookup_results) - await self._add_new_data_sources(mappings_without_data_sources) - extant_urls: set[str] = {lookup.url_info.url for lookup in lookups_existing_urls} - urls_to_add: list[str] = list(self.urls - extant_urls) - if len(urls_to_add) != 0: - new_url_mappings: list[URLMapping] = await self._add_new_urls(urls_to_add) - await self._add_new_data_sources(new_url_mappings) - await self._insert_agency_link(new_url_mappings) - else: - new_url_mappings: list[URLMapping] = [] - - # Upsert validated flags - all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings - mapper = URLMapper(all_url_mappings) - await 
self._upsert_validated_flags(mapper) - - async def _lookup_urls(self) -> list[LookupURLForDataSourcesSyncResponse]: - return await self.requester.lookup_urls(list(self.urls)) - - async def _insert_agency_link(self, url_mappings: list[URLMapping]): - link_url_agency_insert_params = self.param_manager.insert_agency_link( - url_mappings - ) - await self.requester.add_new_agency_links(link_url_agency_insert_params) - - async def _update_agency_link(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): - link_url_agency_update_params = self.param_manager.update_agency_link( - lookups_existing_urls - ) - await self.requester.update_agency_links(link_url_agency_update_params) - - async def _add_new_data_sources(self, url_mappings: list[URLMapping]) -> None: - url_ds_insert_params = self.param_manager.add_new_data_sources(url_mappings) - await self.requester.add_new_data_sources(url_ds_insert_params) - - async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: - url_insert_params: list[InsertURLForDataSourcesSyncParams] = self.param_manager.add_new_urls(urls) - url_mappings = await self.requester.add_new_urls(url_insert_params) - return url_mappings - - async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]) -> None: - update_params = self.param_manager.update_existing_urls(lookups_existing_urls) - await self.requester.update_existing_urls(update_params) - - async def _upsert_validated_flags(self, url_mapper: URLMapper) -> None: - flags: list[FlagURLValidatedPydantic] = self.param_manager.upsert_validated_flags(url_mapper) - await self.requester.upsert_validated_flags(flags) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py deleted file mode 100644 index 168f2511..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py +++ /dev/null @@ -1,64 +0,0 @@ -from src.collectors.enums import URLStatus -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ - InsertURLForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ - UpdateURLForDataSourcesSyncParams -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus - - -def convert_to_source_collector_url_status( - ds_url_status: DataSourcesURLStatus, - ds_approval_status: ApprovalStatus -) -> URLStatus: - match ds_url_status: - case DataSourcesURLStatus.AVAILABLE: - raise NotImplementedError("Logic not implemented for this status.") - case DataSourcesURLStatus.NONE_FOUND: - raise NotImplementedError("Logic not implemented for this status.") - case DataSourcesURLStatus.BROKEN: - return URLStatus.NOT_FOUND - case _: - pass - - match ds_approval_status: - case ApprovalStatus.APPROVED: - return URLStatus.OK - case ApprovalStatus.REJECTED: - return URLStatus.NOT_RELEVANT - case ApprovalStatus.NEEDS_IDENTIFICATION: - return URLStatus.OK - case ApprovalStatus.PENDING: - return URLStatus.OK - case _: - raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") - -def convert_to_url_update_params( - url_id: int, - sync_info: DataSourcesSyncResponseInnerInfo -) -> UpdateURLForDataSourcesSyncParams: - return UpdateURLForDataSourcesSyncParams( - id=url_id, - name=sync_info.name, - description=sync_info.description, - status=convert_to_source_collector_url_status( - ds_url_status=sync_info.url_status, - 
ds_approval_status=sync_info.approval_status - ), - record_type=sync_info.record_type - ) - -def convert_to_url_insert_params( - url: str, - sync_info: DataSourcesSyncResponseInnerInfo -) -> InsertURLForDataSourcesSyncParams: - return InsertURLForDataSourcesSyncParams( - url=url, - name=sync_info.name, - description=sync_info.description, - status=convert_to_source_collector_url_status( - ds_url_status=sync_info.url_status, - ds_approval_status=sync_info.approval_status - ), - record_type=sync_info.record_type - ) diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/filter.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/filter.py deleted file mode 100644 index d7e6ba73..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/filter.py +++ /dev/null @@ -1,29 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import \ - LookupURLForDataSourcesSyncResponse -from src.db.dtos.url.mapping import URLMapping - - -def filter_for_urls_with_ids( - lookup_results: list[LookupURLForDataSourcesSyncResponse] -) -> list[LookupURLForDataSourcesSyncResponse]: - return [ - lookup_result - for lookup_result in lookup_results - if lookup_result.url_info.url_id is not None - ] - -def get_mappings_for_urls_without_data_sources( - lookup_results: list[LookupURLForDataSourcesSyncResponse] -) -> list[URLMapping]: - lookups_without_data_sources = [ - lookup_result - for lookup_result in lookup_results - if lookup_result.data_source_id is None - ] - return [ - URLMapping( - url_id=lookup_result.url_info.url_id, - url=lookup_result.url_info.url - ) - for lookup_result in lookups_without_data_sources - ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/mapper.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/mapper.py deleted file mode 100644 index a60904a0..00000000 --- 
a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/mapper.py +++ /dev/null @@ -1,13 +0,0 @@ -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo - - -class URLSyncInfoMapper: - - def __init__(self, sync_infos: list[DataSourcesSyncResponseInnerInfo]): - self._dict: dict[str, DataSourcesSyncResponseInnerInfo] = { - sync_info.url: sync_info - for sync_info in sync_infos - } - - def get(self, url: str) -> DataSourcesSyncResponseInnerInfo: - return self._dict[url] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py deleted file mode 100644 index dd45f727..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ /dev/null @@ -1,126 +0,0 @@ -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import \ - convert_approval_status_to_validated_type -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ - convert_to_url_insert_params -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ - InsertURLForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import \ - LookupURLForDataSourcesSyncResponse -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ - UpdateURLForDataSourcesSyncParams -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from 
src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from src.external.pdap.enums import ApprovalStatus -from src.util.url_mapper import URLMapper - - -class UpsertURLsFromDataSourcesParamManager: - def __init__( - self, - mapper: URLSyncInfoMapper - ): - self._mapper = mapper - - def update_existing_urls( - self, - lookup_results: list[LookupURLForDataSourcesSyncResponse] - ) -> list[UpdateURLForDataSourcesSyncParams]: - results = [] - for lookup_result in lookup_results: - url_info = lookup_result.url_info - sync_info = self._mapper.get(url_info.url) - update_params = convert_to_url_update_params( - url_id=url_info.url_id, - sync_info=sync_info - ) - results.append(update_params) - return results - - def add_new_urls( - self, - urls: list[str] - ) -> list[InsertURLForDataSourcesSyncParams]: - results = [] - for url in urls: - sync_info = self._mapper.get(url) - insert_params = convert_to_url_insert_params( - url=url, - sync_info=sync_info - ) - results.append(insert_params) - return results - - def update_agency_link( - self, - lookup_results: list[LookupURLForDataSourcesSyncResponse] - ) -> list[UpdateLinkURLAgencyParams]: - results = [] - for lookup_result in lookup_results: - url_info = lookup_result.url_info - sync_info = self._mapper.get(url_info.url) - update_params = UpdateLinkURLAgencyParams( - url_id=url_info.url_id, - new_agency_ids=sync_info.agency_ids, - old_agency_ids=url_info.agency_ids - ) - results.append(update_params) - return results - - def insert_agency_link( - self, - url_mappings: list[URLMapping] - ) -> list[LinkURLAgencyPydantic]: - results = [] - for mapping in url_mappings: - sync_info = self._mapper.get(mapping.url) - for agency_id in sync_info.agency_ids: - results.append( 
- LinkURLAgencyPydantic( - url_id=mapping.url_id, - agency_id=agency_id - ) - ) - - return results - - def add_new_data_sources( - self, - mappings: list[URLMapping] - ) -> list[URLDataSourcePydantic]: - results = [] - for mapping in mappings: - sync_info = self._mapper.get(mapping.url) - results.append( - URLDataSourcePydantic( - data_source_id=sync_info.id, - url_id=mapping.url_id - ) - ) - return results - - def upsert_validated_flags( - self, - mapper: URLMapper - ) -> list[FlagURLValidatedPydantic]: - urls: list[str] = mapper.get_all_urls() - flags: list[FlagURLValidatedPydantic] = [] - for url in urls: - url_id: int = mapper.get_id(url) - sync_info: DataSourcesSyncResponseInnerInfo = self._mapper.get(url) - approval_status: ApprovalStatus = sync_info.approval_status - validated_type: URLType = convert_approval_status_to_validated_type(approval_status) - flag = FlagURLValidatedPydantic( - url_id=url_id, - type=validated_type - ) - flags.append(flag) - - return flags \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py deleted file mode 100644 index eaae3a17..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py +++ /dev/null @@ -1,82 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.core import \ - URLAgencyLinkUpdateQueryBuilder -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ - InsertURLForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.query import \ - LookupURLForDataSourcesSyncQueryBuilder -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response 
import \ - LookupURLForDataSourcesSyncResponse -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ - UpdateURLForDataSourcesSyncParams -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic - - -class UpsertURLsFromDataSourcesDBRequester: - - def __init__(self, session: AsyncSession): - self.session = session - - - async def add_new_urls( - self, - params: list[InsertURLForDataSourcesSyncParams] - ): - url_ids = await sh.bulk_insert( - session=self.session, - models=params, - return_ids=True - ) - results = [] - for insert_param, url_id in zip(params, url_ids): - results.append( - URLMapping( - url=insert_param.url, - url_id=url_id, - ) - ) - return results - - async def lookup_urls( - self, - urls: list[str], - ) -> list[LookupURLForDataSourcesSyncResponse]: - """Lookup URLs for data source sync-relevant information.""" - builder = LookupURLForDataSourcesSyncQueryBuilder(urls=urls) - return await builder.run(session=self.session) - - async def update_existing_urls( - self, - params: list[UpdateURLForDataSourcesSyncParams], - ) -> None: - await sh.bulk_update(session=self.session, models=params) - - async def add_new_data_sources( - self, - params: list[URLDataSourcePydantic] - ) -> None: - await sh.bulk_insert(session=self.session, models=params) - - async def add_new_agency_links( - self, - params: list[LinkURLAgencyPydantic] - ): - await sh.bulk_insert(session=self.session, models=params) - - async def update_agency_links( - self, - params: list[UpdateLinkURLAgencyParams] - ) -> None: - """Overwrite existing url_agency links with new ones, if applicable.""" - query = URLAgencyLinkUpdateQueryBuilder(params) - await 
query.run(self.session) - - async def upsert_validated_flags(self, flags: list[FlagURLValidatedPydantic]) -> None: - await sh.bulk_upsert(self.session, models=flags) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py deleted file mode 100644 index 50b8e586..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py +++ /dev/null @@ -1,18 +0,0 @@ -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.templates.markers.bulk.insert import BulkInsertableModel - - -class InsertURLForDataSourcesSyncParams(BulkInsertableModel): - url: str - name: str - description: str | None - status: URLStatus - record_type: RecordType - source: URLSource = URLSource.DATA_SOURCES - - @classmethod - def sa_model(cls) -> type[URL]: - return URL \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/format.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/format.py 
deleted file mode 100644 index 027cf3c3..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/format.py +++ /dev/null @@ -1,7 +0,0 @@ - - - -def format_agency_ids_result(agency_ids: list[int | None]) -> list[int]: - if agency_ids == [None]: - return [] - return agency_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py deleted file mode 100644 index d77be0ab..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py +++ /dev/null @@ -1,62 +0,0 @@ -from sqlalchemy import func, select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.format import format_agency_ids_result -from src.db.helpers.session import session_helper as sh -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import \ - LookupURLForDataSourcesSyncResponse, URLDataSyncInfo -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.db.queries.base.builder import QueryBuilderBase - - -class LookupURLForDataSourcesSyncQueryBuilder(QueryBuilderBase): - """Look up provided URLs for corresponding database entries.""" - - def __init__(self, urls: list[str]): - super().__init__() - self.urls = urls - - async def run(self, session: AsyncSession) -> list[LookupURLForDataSourcesSyncResponse]: - url_id_label = "url_id" - data_source_id_label = "data_source_id" - agency_ids_label = "agency_ids" - - query = ( - select( - URL.url, - URL.id.label(url_id_label), - URLDataSource.data_source_id.label(data_source_id_label), - func.json_agg(LinkURLAgency.agency_id).label(agency_ids_label) - ).select_from(URL) - 
.outerjoin(URLDataSource) - .outerjoin(LinkURLAgency) - .where( - URL.url.in_( - self.urls - ) - ) - .group_by( - URL.url, - URL.id, - URLDataSource.data_source_id - ) - ) - - db_results = await sh.mappings(session=session, query=query) - - final_results = [] - for db_result in db_results: - final_results.append( - LookupURLForDataSourcesSyncResponse( - data_source_id=db_result[data_source_id_label], - url_info=URLDataSyncInfo( - url=db_result["url"], - url_id=db_result[url_id_label], - agency_ids=format_agency_ids_result(db_result[agency_ids_label]) - ) - ) - ) - - return final_results diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/response.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/response.py deleted file mode 100644 index 845a6589..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/response.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import BaseModel - -class URLDataSyncInfo(BaseModel): - url: str - url_id: int - agency_ids: list[int] - -class LookupURLForDataSourcesSyncResponse(BaseModel): - data_source_id: int | None - url_info: URLDataSyncInfo | None diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/__init__.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py deleted file mode 100644 index c8d20afb..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py +++ /dev/null @@ -1,21 +0,0 @@ -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.templates.markers.bulk.update import BulkUpdatableModel - - 
-class UpdateURLForDataSourcesSyncParams(BulkUpdatableModel): - - @classmethod - def id_field(cls) -> str: - return "id" - - @classmethod - def sa_model(cls) -> type[URL]: - return URL - - id: int - name: str - description: str | None - status: URLStatus - record_type: RecordType diff --git a/src/core/tasks/scheduled/impl/sync/exceptions.py b/src/core/tasks/scheduled/impl/sync/exceptions.py deleted file mode 100644 index 0af9937f..00000000 --- a/src/core/tasks/scheduled/impl/sync/exceptions.py +++ /dev/null @@ -1,5 +0,0 @@ - - - -class MaxRequestsExceededError(Exception): - pass \ No newline at end of file diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index da3a6e4b..88cdde20 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -8,8 +8,6 @@ from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.run_url_tasks.operator import RunURLTasksTaskOperator -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.hub.client import HuggingFaceHubClient @@ -69,22 +67,6 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.DAILY.value, enabled=self.env.bool("DELETE_OLD_LOGS_TASK_FLAG", default=True) ), - ScheduledTaskEntry( - operator=SyncDataSourcesTaskOperator( - adb_client=self.adb_client, - pdap_client=self.pdap_client - ), - interval_minutes=IntervalEnum.DAILY.value, - enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True) - ), - ScheduledTaskEntry( - 
operator=SyncAgenciesTaskOperator( - adb_client=self.async_core.adb_client, - pdap_client=self.pdap_client - ), - interval_minutes=IntervalEnum.DAILY.value, - enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) - ), ScheduledTaskEntry( operator=RunURLTasksTaskOperator(async_core=self.async_core), interval_minutes=self.env.int( diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index b81d641a..00993798 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -22,6 +22,7 @@ from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator from src.core.tasks.url.operators.screenshot.core import URLScreenshotTaskOperator from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator +from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient @@ -200,6 +201,18 @@ def _get_location_id_task_operator(self) -> URLTaskEntry: ) ) + def _get_auto_validate_task_operator(self) -> URLTaskEntry: + operator = AutoValidateURLTaskOperator( + adb_client=self.adb_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.env.bool( + "URL_AUTO_VALIDATE_TASK_FLAG", + default=True + ) + ) + async def load_entries(self) -> list[URLTaskEntry]: return [ @@ -213,5 +226,6 @@ async def load_entries(self) -> list[URLTaskEntry]: self._get_submit_approved_url_task_operator(), self._get_url_auto_relevance_task_operator(), self._get_url_screenshot_task_operator(), - self._get_location_id_task_operator() + self._get_location_id_task_operator(), + self._get_auto_validate_task_operator() ] diff --git a/src/core/tasks/url/operators/submit_approved/queries/cte.py b/src/core/tasks/url/operators/submit_approved/queries/cte.py index 1ef5617f..5d883429 100644 --- 
a/src/core/tasks/url/operators/submit_approved/queries/cte.py +++ b/src/core/tasks/url/operators/submit_approved/queries/cte.py @@ -15,6 +15,7 @@ ) .where( URL.status == URLStatus.OK, + URL.name.isnot(None), FlagURLValidated.type == URLType.DATA_SOURCE, ~exists().where( URLDataSource.url_id == URL.id diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index 2da731bd..d4138f9a 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -33,7 +33,8 @@ async def _build_query(): .options( selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.optional_data_source_metadata), selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.confirmed_agencies), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.reviewing_user) + selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.reviewing_user), + selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.record_type), ).limit(100) ) return query @@ -58,7 +59,7 @@ async def _process_result(url: URL) -> SubmitApprovedURLTDO: name=url.name, agency_ids=agency_ids, description=url.description, - record_type=url.record_type, + record_type=url.record_type.record_type, record_formats=record_formats, data_portal_type=data_portal_type, supplying_entity=supplying_entity, diff --git a/src/core/tasks/url/operators/validate/core.py b/src/core/tasks/url/operators/validate/core.py index 23ca00c1..9d8aa5af 100644 --- a/src/core/tasks/url/operators/validate/core.py +++ b/src/core/tasks/url/operators/validate/core.py @@ -1,4 +1,8 @@ from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.validate.queries.get.core import GetURLsForAutoValidationQueryBuilder +from src.core.tasks.url.operators.validate.queries.get.models.response import GetURLsForAutoValidationResponse +from src.core.tasks.url.operators.validate.queries.insert import InsertURLAutoValidationsQueryBuilder +from 
src.core.tasks.url.operators.validate.queries.prereq.core import AutoValidatePrerequisitesQueryBuilder from src.db.enums import TaskType @@ -9,15 +13,18 @@ def task_type(self) -> TaskType: return TaskType.AUTO_VALIDATE async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.adb_client.run_query_builder( + AutoValidatePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - # TODO (SM422): Implement - # Get URLs for auto validation - - # Link - - # Add Validation Objects (Flag and ValidationType) - - raise NotImplementedError \ No newline at end of file + responses: list[GetURLsForAutoValidationResponse] = await self.adb_client.run_query_builder( + GetURLsForAutoValidationQueryBuilder() + ) + url_ids: list[int] = [response.url_id for response in responses] + await self.link_urls_to_task(url_ids) + + await self.adb_client.run_query_builder( + InsertURLAutoValidationsQueryBuilder(responses) + ) diff --git a/src/core/tasks/url/operators/validate/queries/cte.py b/src/core/tasks/url/operators/validate/queries/cte.py deleted file mode 100644 index 3421977b..00000000 --- a/src/core/tasks/url/operators/validate/queries/cte.py +++ /dev/null @@ -1,8 +0,0 @@ - - -class AutoValidatedTaskOperatorPrerequisitesCTEContainer: - - def __init__(self): - self._query = ( - # TODO: Complete - ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/__init__.py b/src/core/tasks/url/operators/validate/queries/ctes/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/__init__.py rename to src/core/tasks/url/operators/validate/queries/ctes/__init__.py diff --git a/src/core/tasks/scheduled/impl/sync/agency/__init__.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/__init__.py rename to src/core/tasks/url/operators/validate/queries/ctes/consensus/__init__.py diff --git 
a/src/core/tasks/url/operators/validate/queries/ctes/consensus/base.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/base.py new file mode 100644 index 00000000..7a85df9c --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/consensus/base.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod + +from sqlalchemy import Column, CTE + + +class ValidationCTEContainer: + _query: CTE + + @property + def url_id(self) -> Column[int]: + return self._query.c.url_id + + @property + def query(self) -> CTE: + return self._query \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/consensus/helper.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/helper.py new file mode 100644 index 00000000..6078e5bb --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/consensus/helper.py @@ -0,0 +1,17 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.validate.queries.ctes.scored import ScoredCTEContainer + + +def build_validation_query( + scored_cte: ScoredCTEContainer, + label: str +) -> CTE: + return select( + scored_cte.url_id, + scored_cte.entity.label(label) + ).where( + scored_cte.max_votes >= 2, + scored_cte.votes == scored_cte.max_votes, + scored_cte.num_labels_with_that_vote == 1 + ).cte(f"{label}_validation") diff --git a/src/core/tasks/scheduled/impl/sync/agency/dtos/__init__.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/dtos/__init__.py rename to src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/__init__.py diff --git a/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/agency.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/agency.py new file mode 100644 index 00000000..b5b5ee63 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/agency.py @@ -0,0 
+1,24 @@ +from sqlalchemy import select, Column + +from src.core.tasks.url.operators.validate.queries.ctes.consensus.base import ValidationCTEContainer +from src.core.tasks.url.operators.validate.queries.ctes.consensus.helper import build_validation_query +from src.core.tasks.url.operators.validate.queries.ctes.counts.impl.agency import AGENCY_VALIDATION_COUNTS_CTE +from src.core.tasks.url.operators.validate.queries.ctes.scored import ScoredCTEContainer + + +class AgencyValidationCTEContainer(ValidationCTEContainer): + + def __init__(self): + _scored = ScoredCTEContainer( + AGENCY_VALIDATION_COUNTS_CTE + ) + + self._query = build_validation_query( + _scored, + "agency_id" + ) + + + @property + def agency_id(self) -> Column[int]: + return self._query.c.agency_id \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/location.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/location.py new file mode 100644 index 00000000..29951968 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/location.py @@ -0,0 +1,23 @@ +from sqlalchemy import Column + +from src.core.tasks.url.operators.validate.queries.ctes.consensus.base import ValidationCTEContainer +from src.core.tasks.url.operators.validate.queries.ctes.consensus.helper import build_validation_query +from src.core.tasks.url.operators.validate.queries.ctes.counts.impl.location import LOCATION_VALIDATION_COUNTS_CTE +from src.core.tasks.url.operators.validate.queries.ctes.scored import ScoredCTEContainer + + +class LocationValidationCTEContainer(ValidationCTEContainer): + + def __init__(self): + _scored = ScoredCTEContainer( + LOCATION_VALIDATION_COUNTS_CTE + ) + + self._query = build_validation_query( + _scored, + "location_id" + ) + + @property + def location_id(self) -> Column[int]: + return self._query.c.location_id \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/record_type.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/record_type.py new file mode 100644 index 00000000..befb0c7e --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/record_type.py @@ -0,0 +1,24 @@ +from sqlalchemy import select, Column + +from src.core.tasks.url.operators.validate.queries.ctes.consensus.base import ValidationCTEContainer +from src.core.tasks.url.operators.validate.queries.ctes.consensus.helper import build_validation_query +from src.core.tasks.url.operators.validate.queries.ctes.counts.impl.record_type import RECORD_TYPE_COUNTS_CTE +from src.core.tasks.url.operators.validate.queries.ctes.scored import ScoredCTEContainer + + +class RecordTypeValidationCTEContainer(ValidationCTEContainer): + + def __init__(self): + + _scored = ScoredCTEContainer( + RECORD_TYPE_COUNTS_CTE + ) + + self._query = build_validation_query( + _scored, + "record_type" + ) + + @property + def record_type(self) -> Column[str]: + return self._query.c.record_type \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/url_type.py b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/url_type.py new file mode 100644 index 00000000..4d4ec750 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/consensus/impl/url_type.py @@ -0,0 +1,23 @@ +from sqlalchemy import select, Column + +from src.core.tasks.url.operators.validate.queries.ctes.consensus.base import ValidationCTEContainer +from src.core.tasks.url.operators.validate.queries.ctes.consensus.helper import build_validation_query +from src.core.tasks.url.operators.validate.queries.ctes.counts.impl.url_type import URL_TYPES_VALIDATION_COUNTS_CTE +from src.core.tasks.url.operators.validate.queries.ctes.scored import ScoredCTEContainer + + +class URLTypeValidationCTEContainer(ValidationCTEContainer): + + def __init__(self): 
+ _scored = ScoredCTEContainer( + URL_TYPES_VALIDATION_COUNTS_CTE + ) + + self._query = build_validation_query( + _scored, + "url_type" + ) + + @property + def url_type(self) -> Column[str]: + return self._query.c.url_type \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/__init__.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/__init__.py rename to src/core/tasks/url/operators/validate/queries/ctes/counts/__init__.py diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/core.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/core.py new file mode 100644 index 00000000..af7e97b4 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/core.py @@ -0,0 +1,23 @@ +from sqlalchemy import CTE, Column + + +class ValidatedCountsCTEContainer: + + def __init__(self, cte: CTE): + self._cte: CTE = cte + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> Column[int]: + return self._cte.c.url_id + + @property + def entity(self) -> Column: + return self._cte.c.entity + + @property + def votes(self) -> Column[int]: + return self._cte.c.votes \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py rename to src/core/tasks/url/operators/validate/queries/ctes/counts/impl/__init__.py diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py new file mode 100644 index 00000000..e9df9db4 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py @@ -0,0 +1,24 @@ +from sqlalchemy import 
from sqlalchemy import select, func

from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer
from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion
from src.db.models.views.unvalidated_url import UnvalidatedURL

# Vote counts for user *location* suggestions on not-yet-validated URLs:
# one row per (url_id, location_id) with the number of users voting for it.
LOCATION_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer(
    (
        select(
            UserLocationSuggestion.url_id,
            # Aliased to the generic "entity" column name expected by
            # ValidatedCountsCTEContainer.
            UserLocationSuggestion.location_id.label("entity"),
            # One row per user suggestion -> count() == number of votes.
            func.count().label("votes")
        )
        .join(
            # Restrict counting to URLs still awaiting validation.
            UnvalidatedURL,
            UserLocationSuggestion.url_id == UnvalidatedURL.url_id
        )
        .group_by(
            UserLocationSuggestion.url_id,
            UserLocationSuggestion.location_id
        )
        .cte("counts_location")
    )
)
from sqlalchemy import select, func

from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer
from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion
from src.db.models.views.unvalidated_url import UnvalidatedURL

# NOTE: removed an unused `UserRecordTypeSuggestion` import left over from
# the sibling record_type module; this module only counts URL-type votes.

# Vote counts for user *URL type* suggestions on not-yet-validated URLs:
# one row per (url_id, suggested type) with the number of users voting for it.
URL_TYPES_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer(
    (
        select(
            UserURLTypeSuggestion.url_id,
            # Aliased to the generic "entity" column name expected by
            # ValidatedCountsCTEContainer.
            UserURLTypeSuggestion.type.label("entity"),
            # One row per user suggestion -> count() == number of votes.
            func.count().label("votes")
        )
        .join(
            # Restrict counting to URLs still awaiting validation.
            UnvalidatedURL,
            UserURLTypeSuggestion.url_id == UnvalidatedURL.url_id
        )
        .group_by(
            UserURLTypeSuggestion.url_id,
            UserURLTypeSuggestion.type
        )
        .cte("counts_url_type")
    )
)
class ScoredCTEContainer:
    """Wraps a vote-count CTE with window-function scoring columns.

    For each (url_id, entity, votes) row of the counts CTE, adds:
      * ``max_votes`` — the highest vote total among all entities for that URL.
      * ``num_labels_with_that_vote`` — how many entities for that URL share
        this row's exact vote total; on the max row, a value > 1 indicates a
        tie for first place (i.e., no single consensus winner).
    """

    def __init__(
        self,
        counts_cte: ValidatedCountsCTEContainer
    ):
        self._cte: CTE = (
            select(
                counts_cte.url_id,
                counts_cte.entity,
                counts_cte.votes,
                # Highest vote count achieved by any entity on this URL.
                func.max(counts_cte.votes).over(
                    partition_by=counts_cte.url_id
                ).label("max_votes"),
                # Number of entities on this URL with this same vote total.
                func.count().over(
                    partition_by=(
                        counts_cte.url_id,
                        counts_cte.votes
                    )
                ).label("num_labels_with_that_vote")
            )
            # CTE name derives from the wrapped counts CTE,
            # e.g. "scored_counts_agency".
            .cte(f"scored_{counts_cte.cte.name}")
        )

    @property
    def cte(self) -> CTE:
        """The scored CTE (for joins / select_from)."""
        return self._cte

    @property
    def url_id(self) -> Column[int]:
        return self._cte.c.url_id

    @property
    def entity(self) -> Column:
        return self._cte.c.entity

    @property
    def votes(self) -> Column[int]:
        return self._cte.c.votes

    @property
    def max_votes(self) -> Column[int]:
        return self._cte.c.max_votes

    @property
    def num_labels_with_that_vote(self) -> Column[int]:
        return self._cte.c.num_labels_with_that_vote
class GetURLsForAutoValidationQueryBuilder(QueryBuilderBase):
    """Collects per-URL consensus values for every URL eligible for auto-validation."""

    async def run(self, session: AsyncSession) -> list[GetURLsForAutoValidationResponse]:
        """Return one response per eligible URL with its consensus values.

        Outer-joins each consensus CTE (agency, location, url_type,
        record_type) onto `urls`, then filters to rows meeting the
        auto-validation prerequisites via `add_where_condition`.
        """
        # Fresh consensus containers; each wraps its own CTE instance.
        agency = AgencyValidationCTEContainer()
        location = LocationValidationCTEContainer()
        url_type = URLTypeValidationCTEContainer()
        record_type = RecordTypeValidationCTEContainer()
        query = (
            select(
                URL.id.label("url_id"),
                location.location_id,
                agency.agency_id,
                url_type.url_type,
                record_type.record_type,
            )
            # Outer joins: a URL may lack consensus on some dimensions;
            # the WHERE clause added below decides which combinations qualify.
            .outerjoin(
                agency.query,
                URL.id == agency.url_id,
            )
            .outerjoin(
                location.query,
                URL.id == location.url_id,
            )
            .outerjoin(
                url_type.query,
                URL.id == url_type.url_id,
            )
            .outerjoin(
                record_type.query,
                URL.id == record_type.url_id,
            )
        )
        query = add_where_condition(
            query,
            agency=agency,
            location=location,
            url_type=url_type,
            record_type=record_type,
        )

        mappings: Sequence[RowMapping] = await sh.mappings(session, query=query)
        responses: list[GetURLsForAutoValidationResponse] = []
        for mapping in mappings:
            try:
                # Pydantic validators enforce the cross-field contract
                # (e.g. record_type only for DATA_SOURCE rows).
                response = GetURLsForAutoValidationResponse(**mapping)
                responses.append(response)
            except FailedValidationException as e:
                # NOTE(review): assumes pydantic propagates
                # FailedValidationException unchanged from model validators —
                # confirm it is not a ValueError subclass (which pydantic
                # would wrap in a ValidationError instead).
                raise FailedValidationException(
                    f"Failed to validate URL {mapping['url_id']}") from e
        return responses
from pydantic import BaseModel, model_validator

from src.core.enums import RecordType
from src.core.exceptions import FailedValidationException
from src.db.models.impl.flag.url_validated.enums import URLType

# URL types considered "relevant": these require agency and location
# consensus before the URL can be auto-validated.
_RELEVANT_URL_TYPES = (
    URLType.META_URL,
    URLType.DATA_SOURCE,
    URLType.INDIVIDUAL_RECORD,
)


class GetURLsForAutoValidationResponse(BaseModel):
    """Consensus values for a single URL eligible for auto-validation.

    Cross-field rules (enforced by the validators below):
      * record_type is required iff url_type is DATA_SOURCE.
      * agency_id and location_id are required for relevant types
        (META_URL, DATA_SOURCE, INDIVIDUAL_RECORD).
      * NOT_RELEVANT URLs must carry no other consensus values.
      * The deprecated CONTACT_INFO_AND_AGENCY_META record type is rejected.
    """

    url_id: int
    location_id: int | None
    agency_id: int | None
    url_type: URLType
    record_type: RecordType | None

    @model_validator(mode="after")
    def forbid_record_type_if_not_data_source(self):
        """record_type is only meaningful for DATA_SOURCE URLs."""
        if self.url_type == URLType.DATA_SOURCE:
            return self
        if self.record_type is not None:
            # Fixed message: the old text claimed this applied only to
            # META_URL, but the rule covers every non-DATA_SOURCE type.
            raise FailedValidationException(
                "record_type must be None unless url_type is DATA_SOURCE")
        return self

    @model_validator(mode="after")
    def require_record_type_if_data_source(self):
        """DATA_SOURCE URLs must have a consensus record type."""
        if self.url_type == URLType.DATA_SOURCE and self.record_type is None:
            raise FailedValidationException(
                "record_type must be provided if url_type is DATA_SOURCE")
        return self

    @model_validator(mode="after")
    def require_location_if_relevant(self):
        """Relevant URLs must have a consensus location."""
        if self.url_type not in _RELEVANT_URL_TYPES:
            return self
        if self.location_id is None:
            # Fixed message: the old text omitted INDIVIDUAL_RECORD even
            # though the check applies to it.
            raise FailedValidationException(
                "location_id must be provided if url_type is META_URL, "
                "DATA_SOURCE, or INDIVIDUAL_RECORD")
        return self

    @model_validator(mode="after")
    def require_agency_id_if_relevant(self):
        """Relevant URLs must have a consensus agency."""
        if self.url_type not in _RELEVANT_URL_TYPES:
            return self
        if self.agency_id is None:
            raise FailedValidationException(
                "agency_id must be provided if url_type is META_URL, "
                "DATA_SOURCE, or INDIVIDUAL_RECORD")
        return self

    @model_validator(mode="after")
    def forbid_all_else_if_not_relevant(self):
        """NOT_RELEVANT URLs may not carry any other consensus values."""
        if self.url_type != URLType.NOT_RELEVANT:
            return self
        if self.record_type is not None:
            raise FailedValidationException(
                "record_type must be None if url_type is NOT_RELEVANT")
        if self.agency_id is not None:
            # Fixed message: field is singular (`agency_id`); the old text
            # said "agency_ids must be empty".
            raise FailedValidationException(
                "agency_id must be None if url_type is NOT_RELEVANT")
        if self.location_id is not None:
            raise FailedValidationException(
                "location_id must be None if url_type is NOT_RELEVANT")
        return self

    @model_validator(mode="after")
    def deprecate_agency_meta_url_record_type(self):
        """Reject the deprecated CONTACT_INFO_AND_AGENCY_META record type."""
        if self.record_type is None:
            return self
        if self.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META:
            raise FailedValidationException(
                "Contact Info & Agency Meta Record Type is Deprecated.")
        return self
def add_where_condition(
    query: Select,
    agency: AgencyValidationCTEContainer,
    location: LocationValidationCTEContainer,
    url_type: URLTypeValidationCTEContainer,
    record_type: RecordTypeValidationCTEContainer
) -> Select:
    """Append the auto-validation eligibility filter to `query`.

    Shared by the prerequisite check and the retrieval query so both use
    identical criteria. A URL qualifies when it has a consensus url_type and:
      * DATA_SOURCE: also has consensus agency, location, and record type; or
      * META_URL / INDIVIDUAL_RECORD: also has consensus agency and location; or
      * NOT_RELEVANT: consensus on the type alone suffices.
    """
    return (
        query
        .where(
            # A consensus URL type is required in every case.
            url_type.url_type.isnot(None),
            or_(
                # DATA_SOURCE: every dimension must have consensus.
                and_(
                    url_type.url_type == URLType.DATA_SOURCE.value,
                    agency.agency_id.isnot(None),
                    location.location_id.isnot(None),
                    record_type.record_type.isnot(None),
                ),
                # META_URL / INDIVIDUAL_RECORD: agency + location consensus.
                and_(
                    url_type.url_type.in_(
                        (URLType.META_URL.value, URLType.INDIVIDUAL_RECORD.value)
                    ),
                    agency.agency_id.isnot(None),
                    location.location_id.isnot(None),
                ),
                # NOT_RELEVANT: type consensus alone is enough.
                url_type.url_type == URLType.NOT_RELEVANT.value
            ),
        )
    )
class InsertURLAutoValidationsQueryBuilder(QueryBuilderBase):
    """Persists auto-validation results for a batch of URLs.

    For each response this writes (as applicable) an agency link, a record
    type row, a validated flag, and an auto-validated flag, via bulk inserts.
    """

    def __init__(self, responses: list[GetURLsForAutoValidationResponse]):
        super().__init__()
        self._responses = responses

    async def run(self, session: AsyncSession) -> Any:
        """Accumulate row models per table, then bulk-insert each batch."""
        agency_links: list[LinkURLAgencyPydantic] = []
        record_type_rows: list[URLRecordTypePydantic] = []
        validated_flags: list[FlagURLValidatedPydantic] = []
        auto_validated_flags: list[FlagURLAutoValidatedPydantic] = []

        for resp in self._responses:
            # Optional per-response rows: only present when consensus exists.
            if resp.agency_id is not None:
                agency_links.append(
                    LinkURLAgencyPydantic(
                        url_id=resp.url_id,
                        agency_id=resp.agency_id
                    )
                )
            if resp.record_type is not None:
                record_type_rows.append(
                    URLRecordTypePydantic(
                        url_id=resp.url_id,
                        record_type=resp.record_type
                    )
                )
            # Every response gets both flags.
            validated_flags.append(
                FlagURLValidatedPydantic(
                    url_id=resp.url_id,
                    type=resp.url_type
                )
            )
            auto_validated_flags.append(
                FlagURLAutoValidatedPydantic(
                    url_id=resp.url_id,
                )
            )

        # Insert batches in the same order as before: links, record types,
        # validated flags, auto-validated flags.
        for batch in (
            agency_links,
            record_type_rows,
            validated_flags,
            auto_validated_flags,
        ):
            await sh.bulk_insert(session, models=batch)
class AutoValidatePrerequisitesQueryBuilder(QueryBuilderBase):
    """
    Checks whether at least one unvalidated URL meets the auto-validation
    prerequisites (as enforced by the shared `add_where_condition` filter):
    - A DATA_SOURCE URL with consensus on url_type, agency, location, and record type
    - A META_URL or INDIVIDUAL_RECORD URL with consensus on url_type, agency, and location
    - A NOT_RELEVANT URL with consensus on url_type alone
    """

    async def run(self, session: AsyncSession) -> bool:
        """Return True if any URL qualifies (LIMIT 1 existence probe)."""
        # Fresh consensus containers; each wraps its own CTE instance.
        agency = AgencyValidationCTEContainer()
        location = LocationValidationCTEContainer()
        url_type = URLTypeValidationCTEContainer()
        record_type = RecordTypeValidationCTEContainer()


        query = (
            select(
                UnvalidatedURL.url_id,
            )
            .select_from(
                UnvalidatedURL
            )
            # Outer joins: a URL may lack consensus on some dimensions;
            # the eligibility filter below decides which combinations qualify.
            .outerjoin(
                agency.query,
                UnvalidatedURL.url_id == agency.url_id,
            )
            .outerjoin(
                location.query,
                UnvalidatedURL.url_id == location.url_id,
            )
            .outerjoin(
                url_type.query,
                UnvalidatedURL.url_id == url_type.url_id,
            )
            .outerjoin(
                record_type.query,
                UnvalidatedURL.url_id == record_type.url_id,
            )
        )
        # Same eligibility criteria as the retrieval query builder.
        query = add_where_condition(
            query,
            agency=agency,
            location=location,
            url_type=url_type,
            record_type=record_type,
        ).limit(1)

        # Existence check only; no rows beyond the first are fetched.
        return await sh.results_exist(session, query=query)
src.core.tasks.scheduled.impl.huggingface.queries.state import SetHuggingFaceUploadStateQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.impl.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder -from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query -from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \ - get_update_agencies_sync_progress_query -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \ - GetDataSourcesSyncParametersQueryBuilder -from src.core.tasks.scheduled.impl.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query -from src.core.tasks.scheduled.impl.sync.data_sources.queries.update_sync_progress import \ - get_update_data_sources_sync_progress_query -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.core import \ - UpsertURLsFromDataSourcesQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder @@ -131,7 +118,6 @@ from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo class AsyncDatabaseClient: @@ -1103,38 +1089,6 @@ async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi async def get_urls_aggregated_pending_metrics(self): return await 
self.run_query_builder(GetMetricsURLSAggregatedPendingQueryBuilder()) - async def get_agencies_sync_parameters(self) -> AgencySyncParameters: - return await self.run_query_builder( - GetAgenciesSyncParametersQueryBuilder() - ) - - async def get_data_sources_sync_parameters(self) -> DataSourcesSyncParameters: - return await self.run_query_builder( - GetDataSourcesSyncParametersQueryBuilder() - ) - - async def upsert_urls_from_data_sources( - self, - data_sources: list[DataSourcesSyncResponseInnerInfo] - ) -> None: - await self.run_query_builder( - UpsertURLsFromDataSourcesQueryBuilder( - sync_infos=data_sources - ) - ) - - async def update_agencies_sync_progress(self, page: int) -> None: - await self.execute(get_update_agencies_sync_progress_query(page)) - - async def update_data_sources_sync_progress(self, page: int) -> None: - await self.execute(get_update_data_sources_sync_progress_query(page)) - - async def mark_full_data_sources_sync(self) -> None: - await self.execute(get_mark_full_data_sources_sync_query()) - - async def mark_full_agencies_sync(self) -> None: - await self.execute(get_mark_full_agencies_sync_query()) - @session_manager async def get_html_for_url( self, diff --git a/src/db/models/impl/state/sync/__init__.py b/src/db/models/impl/state/sync/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/db/models/impl/state/sync/agencies.py b/src/db/models/impl/state/sync/agencies.py deleted file mode 100644 index 7ee1babe..00000000 --- a/src/db/models/impl/state/sync/agencies.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Tracks the status of the agencies sync -""" - -from sqlalchemy import DateTime, Date, Integer, Column - -from src.db.models.templates_.base import Base - - -class AgenciesSyncState(Base): - __tablename__ = 'agencies_sync_state' - id = Column(Integer, primary_key=True) - last_full_sync_at = Column( - DateTime(), - nullable=True, - comment="The datetime of the last *full* sync " - "(i.e., the last sync that got all 
entries " - "available to be synchronized)." - ) - current_cutoff_date = Column( - Date(), - nullable=True, - comment="Tracks the cutoff date passed to the agencies sync endpoint." - "On completion of a full sync, this is set to " - "the day before the present day." - ) - current_page = Column( - Integer(), - nullable=True, - comment="Tracks the current page passed to the agencies sync endpoint." - "On completion of a full sync, this is set to `null`." - ) \ No newline at end of file diff --git a/src/db/models/impl/state/sync/data_sources.py b/src/db/models/impl/state/sync/data_sources.py deleted file mode 100644 index 333d0945..00000000 --- a/src/db/models/impl/state/sync/data_sources.py +++ /dev/null @@ -1,28 +0,0 @@ -from sqlalchemy import Integer, Column, DateTime, Date - -from src.db.models.templates_.base import Base - - -class DataSourcesSyncState(Base): - __tablename__ = 'data_sources_sync_state' - id = Column(Integer, primary_key=True) - last_full_sync_at = Column( - DateTime(), - nullable=True, - comment="The datetime of the last *full* sync " - "(i.e., the last sync that got all entries " - "available to be synchronized)." - ) - current_cutoff_date = Column( - Date(), - nullable=True, - comment="Tracks the cutoff date passed to the data sources sync endpoint." - "On completion of a full sync, this is set to " - "the day before the present day." - ) - current_page = Column( - Integer(), - nullable=True, - comment="Tracks the current page passed to the data sources sync endpoint." - "On completion of a full sync, this is set to `null`." 
- ) \ No newline at end of file diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index 18743f1b..f04dd3df 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -17,5 +17,4 @@ def sa_model(cls) -> type[Base]: collector_metadata: dict | None = None name: str | None = None status: URLStatus = URLStatus.OK - record_type: RecordType | None = None source: URLSource \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 6caa216e..fec9de54 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -7,6 +7,7 @@ from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.probed_for_404 import URLProbedFor404 +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin from src.db.models.templates_.with_id import WithIDBase @@ -27,11 +28,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): name='url_status', nullable=False ) - record_type = enum_column( - RecordType, - name='record_type', - nullable=True - ) + source = enum_column( URLSource, name='url_source', @@ -45,6 +42,10 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): back_populates="urls", uselist=False, ) + record_type = relationship( + URLRecordType, + uselist=False, + ) duplicates = relationship("Duplicate", back_populates="original_url") html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") error_info = relationship("URLErrorInfo", back_populates="url", cascade="all, delete-orphan") diff --git 
class URLRecordTypePydantic(
    BulkInsertableModel,
    BulkUpsertableModel,
):
    """Bulk-insertable/upsertable row model for the `url_record_type` table."""

    # Foreign key to `urls.id`; also serves as the identity/conflict key
    # for bulk upserts (see id_field below).
    url_id: int
    record_type: RecordType

    @classmethod
    def sa_model(cls) -> type[URLRecordType]:
        """SQLAlchemy model this pydantic model maps onto."""
        return URLRecordType

    @classmethod
    def id_field(cls) -> str:
        """Field used as the identity/conflict key in bulk operations."""
        return "url_id"
1e997079..661edf07 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,18 +1,10 @@ -from datetime import date from typing import Any from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse, \ - SearchAgencyByLocationOuterResponse -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo from src.external.pdap.enums import MatchAgencyResponseStatus @@ -154,67 +146,3 @@ async def submit_urls( results.append(response_object) return results - - async def sync_agencies( - self, - params: AgencySyncParameters - ) -> AgenciesSyncResponseInfo: - url: str = self.access_manager.build_url( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=[ - "agencies", - "sync" - ] - ) - headers: dict[str, str] = await self.access_manager.jwt_header() - headers['Content-Type']: str = "application/json" - request_params: dict[str, Any] = { - "page": params.page - } - if params.cutoff_date is not None: - request_params["updated_at"]: date = params.cutoff_date - - request_info = RequestInfo( - type_=RequestType.GET, - url=url, - 
def url_id_column(name: str = 'url_id', primary_key: bool = False) -> sa.Column:
    """Build the standard foreign-key column referencing `urls.id`.

    :param name: Column name (defaults to 'url_id').
    :param primary_key: Mark the column as (part of) the primary key.
        Defaults to False, preserving the behavior of existing callers;
        one-row-per-URL tables (e.g. `url_record_type`) pass True.
    """
    return sa.Column(
        name,
        sa.Integer(),
        sa.ForeignKey(
            'urls.id',
            ondelete='CASCADE'
        ),
        primary_key=primary_key,
        nullable=False,
        comment='A foreign key to the `urls` table.'
    )
"collector_metadata"] return check_attributes(url, other_attributes, url_only) diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 62f215fb..c9eb62b1 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -7,6 +7,7 @@ from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review from tests.helpers.data_creator.core import DBDataCreator @@ -42,11 +43,14 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.record_type == RecordType.ARREST_RECORDS assert url.status == URLStatus.OK assert url.name == "Test Name" assert url.description == "Test Description" + record_types: list[URLRecordType] = await adb_client.get_all(URLRecordType) + assert len(record_types) == 1 + assert record_types[0].record_type == RecordType.ARREST_RECORDS + # Confirm presence of validated flag validated_flags: list[FlagURLValidated] = await adb_client.get_all(FlagURLValidated) assert len(validated_flags) == 1 diff --git a/tests/automated/integration/db/client/approve_url/test_error.py b/tests/automated/integration/db/client/approve_url/test_error.py index 9523a16c..352e737a 100644 --- a/tests/automated/integration/db/client/approve_url/test_error.py +++ b/tests/automated/integration/db/client/approve_url/test_error.py @@ -30,7 +30,6 @@ async def test_approval_url_error(db_data_creator: DBDataCreator): # Create kwarg 
dictionary with all required approval info fields kwarg_dict = { - "record_type": RecordType.ARREST_RECORDS, "agency_ids": [await db_data_creator.agency()], "name": "Test Name", } diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 05b829df..417677df 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -5,6 +5,7 @@ from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import compress_html from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import get_test_url, get_test_html @@ -39,11 +40,15 @@ async def run(self, session: AsyncSession) -> list[int]: status=URLStatus.OK, name=name, description=description, - record_type=self.inp.record_type, source=URLSource.COLLECTOR ) session.add(url) await session.flush() + record_type = URLRecordType( + url_id=url.id, + record_type=self.inp.record_type, + ) + session.add(record_type) url_ids.append(url.id) if self.inp.status in ( PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py deleted file mode 100644 index 85b9f1bc..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py +++ /dev/null @@ -1,30 +0,0 @@ -import pytest_asyncio - -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.external.pdap.client import PDAPClient -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import update_existing_agencies_updated_at, \ - add_existing_agencies - - -@pytest_asyncio.fixture -async def operator( - adb_client_test: AsyncDatabaseClient, - mock_pdap_client: PDAPClient -) -> SyncAgenciesTaskOperator: - return SyncAgenciesTaskOperator( - adb_client=adb_client_test, - pdap_client=mock_pdap_client - ) - -@pytest_asyncio.fixture -async def setup( - db_data_creator, - operator -) -> SyncAgenciesTaskOperator: - await add_existing_agencies(db_data_creator) - await update_existing_agencies_updated_at(db_data_creator) - - return operator - - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/data.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/data.py deleted file mode 100644 index d3227393..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/data.py +++ /dev/null @@ -1,80 +0,0 @@ -from datetime import datetime - -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo - -PREEXISTING_AGENCY_1 = AgenciesSyncResponseInnerInfo( - display_name="Preexisting Agency 1", - agency_id=1, - state_name="CA", - county_name="San Francisco", - locality_name="San Francisco", - updated_at=datetime(2023, 1, 1, 0, 0, 0) -) - -PREEXISTING_AGENCY_2 = AgenciesSyncResponseInnerInfo( - display_name="Preexisting Agency 2", - agency_id=2, - state_name="NC", - county_name="NC 
County", - locality_name="NC City", - updated_at=datetime(2025, 10, 17, 3, 0, 0) -) - -PREEXISTING_AGENCIES = [ - PREEXISTING_AGENCY_1, - PREEXISTING_AGENCY_2 -] - -FIRST_CALL_RESPONSE = AgenciesSyncResponseInfo( - agencies=[ - AgenciesSyncResponseInnerInfo( - display_name="New Agency 3", - agency_id=3, - state_name=None, - county_name=None, - locality_name=None, - updated_at=datetime(2022, 3, 5, 7, 6, 9) - ), - AgenciesSyncResponseInnerInfo( - display_name="New Agency 4", - agency_id=4, - state_name="Ohio", - county_name=None, - locality_name=None, - updated_at=datetime(2024, 9, 5, 7, 6, 9) - ), - AgenciesSyncResponseInnerInfo( - display_name="New Agency 5", - agency_id=5, - state_name="AL", - county_name="AL County", - locality_name=None, - updated_at=datetime(2023, 12, 4, 0, 0, 0) - ), - AgenciesSyncResponseInnerInfo( - display_name="New Agency 6", - agency_id=6, - state_name="TX", - county_name="TX County", - locality_name="TX City", - updated_at=datetime(2021, 1, 1, 0, 0, 0) - ), - PREEXISTING_AGENCY_1 - ], -) - -SECOND_CALL_RESPONSE = AgenciesSyncResponseInfo( - agencies=[ - PREEXISTING_AGENCY_2 - ] -) - -THIRD_CALL_RESPONSE = AgenciesSyncResponseInfo( - agencies=[] -) - -AGENCIES_SYNC_RESPONSES = [ - FIRST_CALL_RESPONSE, - SECOND_CALL_RESPONSE, - THIRD_CALL_RESPONSE -] \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py deleted file mode 100644 index a38cbaa6..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py +++ /dev/null @@ -1,27 +0,0 @@ -from src.db.models.impl.agency.sqlalchemy import Agency -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE - - -class AgencyChecker: - """ - Checks if an agency matches 
expected values - """ - - def __init__(self): - self.dict_ = {} - for response in [FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE]: - for agency in response.agencies: - self.dict_[agency.agency_id] = agency - - def check( - self, - agency: Agency - ): - info: AgenciesSyncResponseInnerInfo = self.dict_.get( - agency.agency_id - ) - assert info.display_name == agency.name - assert info.state_name == agency.state - assert info.county_name == agency.county - assert info.locality_name == agency.locality diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py deleted file mode 100644 index 6b1a8544..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py +++ /dev/null @@ -1,76 +0,0 @@ -from contextlib import contextmanager -from datetime import timedelta -from unittest.mock import patch - -from sqlalchemy import select, func, TIMESTAMP, cast, update - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.state.sync.agencies import AgenciesSyncState -from src.external.pdap.client import PDAPClient -from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import PREEXISTING_AGENCIES - - -async def check_sync_concluded( - db_client: AsyncDatabaseClient, - check_updated_at: bool = True -): - current_db_datetime = await db_client.scalar( - select( - cast(func.now(), TIMESTAMP) - ) - ) - - sync_state_results = await db_client.scalar( - select( - AgenciesSyncState - ) - ) - assert sync_state_results.current_page is None - assert sync_state_results.last_full_sync_at > current_db_datetime - timedelta(minutes=5) - assert sync_state_results.current_cutoff_date > (current_db_datetime - timedelta(days=2)).date() - - if not check_updated_at: - return - - updated_ats = await db_client.scalars( - select( - Agency.updated_at - ) - ) - assert all( - updated_at > 
current_db_datetime - timedelta(minutes=5) - for updated_at in updated_ats - ) - - -async def update_existing_agencies_updated_at(db_data_creator): - for preexisting_agency in PREEXISTING_AGENCIES: - query = ( - update(Agency) - .where(Agency.agency_id == preexisting_agency.agency_id) - .values(updated_at=preexisting_agency.updated_at) - ) - await db_data_creator.adb_client.execute(query) - -async def add_existing_agencies(db_data_creator): - agencies_to_add = [] - for preexisting_agency in PREEXISTING_AGENCIES: - agency_to_add = Agency( - name=preexisting_agency.display_name, - state=preexisting_agency.state_name, - county=preexisting_agency.county_name, - locality=preexisting_agency.locality_name, - agency_id=preexisting_agency.agency_id, - ) - agencies_to_add.append(agency_to_add) - await db_data_creator.adb_client.add_all(agencies_to_add) - -@contextmanager -def patch_sync_agencies(side_effects: list): - with patch.object( - PDAPClient, - "sync_agencies", - side_effect=side_effects - ): - yield \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py deleted file mode 100644 index 0712d251..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py +++ /dev/null @@ -1,53 +0,0 @@ -from contextlib import contextmanager -from datetime import timedelta, datetime -from unittest.mock import patch, AsyncMock - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo -from 
tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.simple_test_data_functions import generate_test_name - - -def set_up_mock_pdap_client_responses( - mock_pdap_client: PDAPClient, - responses: list[AgenciesSyncResponseInfo | Exception] -) -> None: - """ - Modifies: - - pdap_client.sync_agencies - """ - mock_sync_agencies = AsyncMock( - side_effect=responses + [AgenciesSyncResponseInfo(agencies=[])] - ) - mock_pdap_client.sync_agencies = mock_sync_agencies - -async def set_up_urls( - db_data_creator: DBDataCreator, - record_type: RecordType, - validated_type: URLType | None = None, - agency_ids: list[int] | None = None, -) -> list[int]: - """Create 2 Test URLs in database.""" - url_ids: list[int] = await db_data_creator.create_urls(record_type=record_type, count=2) - if validated_type is not None: - await db_data_creator.create_validated_flags(url_ids=url_ids, validation_type=validated_type) - if agency_ids is not None: - await db_data_creator.create_url_agency_links(url_ids=url_ids, agency_ids=agency_ids) - return url_ids - -def set_up_sync_response_info( - agency_id: int, - meta_urls: list[str], -) -> AgenciesSyncResponseInfo: - yesterday = datetime.now() - timedelta(days=1) - return AgenciesSyncResponseInfo(agencies=[AgenciesSyncResponseInnerInfo( - agency_id=agency_id, - meta_urls=meta_urls, - updated_at=yesterday, - state_name=None, - county_name=None, - locality_name=None, - display_name=generate_test_name(agency_id) - )]) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py deleted file mode 100644 index 8cc57cf5..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest - -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from 
src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ - set_up_mock_pdap_client_responses -from tests.helpers.asserts import assert_task_run_success -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_data_sources_url_in_db_not_meta_url_sync( - wiped_database, - operator: SyncAgenciesTaskOperator, - db_data_creator: DBDataCreator -): - """ - In an Agency Sync, a URL validated as a Data Source linked to the agency - should be untouched if the URL is not in the sync response. 
- """ - db_client: AsyncDatabaseClient = operator.adb_client - - agency_id: int = 1 - - # Create agency - await db_data_creator.create_agency(agency_id) - - # Set up sync response with new meta URL - sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( - agency_id=agency_id, - meta_urls=[ - "https://example.com/meta-url-1", - ] - ) - - # Create additional URL Validated as data source and link to agency - ds_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( - validation_type=URLType.DATA_SOURCE, - record_type=RecordType.ACCIDENT_REPORTS - ))[0] - ds_url_id: int = ds_url_mapping.url_id - await db_data_creator.create_url_agency_links( - url_ids=[ds_url_id], - agency_ids=[agency_id] - ) - - set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) - - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - await check_sync_concluded(db_client) - - # Confirm one agency in the database - agencies: list[Agency] = await db_client.get_all(Agency) - assert len(agencies) == 1 - - # Confirm 2 URLs in database - urls: list[URL] = await db_client.get_all(URL) - assert len(urls) == 2 - assert set(url.record_type for url in urls) == { - RecordType.CONTACT_INFO_AND_AGENCY_META, - RecordType.ACCIDENT_REPORTS - } - - # Confirm 2 Agency-URL Links - links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) - assert len(links) == 2 - assert all(link.agency_id == 1 for link in links) - assert set(link.url_id for link in links) == set(url.id for url in urls) - - # Confirm 2 Validated Flags with different Validation Types - flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) - assert len(flags) == 2 - assert set(flag.type for flag in flags) == { - URLType.META_URL, - URLType.DATA_SOURCE - } - assert set(flag.url_id for flag in flags) == set(url.id for url in urls) - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py deleted file mode 100644 index 80b338db..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py +++ /dev/null @@ -1,82 +0,0 @@ -import pytest -from sqlalchemy import select - -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.core.tasks.url.enums import TaskOperatorOutcome -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.state.sync.agencies import AgenciesSyncState -from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import FIRST_CALL_RESPONSE, \ - THIRD_CALL_RESPONSE, SECOND_CALL_RESPONSE -from tests.automated.integration.tasks.scheduled.impl.sync.agency.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import patch_sync_agencies, check_sync_concluded - - -@pytest.mark.asyncio -async def test_agency_sync_interruption( - setup: SyncAgenciesTaskOperator -): - """ - Simulate interruption that causes it to stop on the second iteration. - Should be able to resume where it left off. 
- """ - operator = setup - db_client = operator.adb_client - - with patch_sync_agencies( - [FIRST_CALL_RESPONSE, ValueError("test error")] - ): - run_info = await operator.run_task() - assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message - - # Get current updated_ats from database for the 5 recently updated - query = ( - select( - Agency.updated_at - ).order_by( - Agency.updated_at.desc() - ).limit(5) - ) - updated_ats = await db_client.scalars(query) - # Assert all have same value - assert all( - updated_at == updated_ats[0] - for updated_at in updated_ats - ) - initial_updated_at = updated_ats[0] - - # Check sync state results - sync_state_results = await db_client.scalar( - select( - AgenciesSyncState - ) - ) - assert sync_state_results.current_page == 2 - assert sync_state_results.last_full_sync_at is None - assert sync_state_results.current_cutoff_date is None - - with patch_sync_agencies([SECOND_CALL_RESPONSE, THIRD_CALL_RESPONSE]): - await operator.run_task() - - await check_sync_concluded(db_client) - - # Check six entries in database - agencies: list[Agency] = await db_client.scalars(( - select( - Agency - ).order_by( - Agency.updated_at - ) - )) - assert len(agencies) == 6 - - checker = AgencyChecker() - for agency in agencies: - checker.check(agency) - - # Check newly updated agency has distinct updated_at value - assert agencies[-1].updated_at != initial_updated_at - # Check other agencies have same updated_at value - assert all( - agency.updated_at == initial_updated_at - for agency in agencies[:-1] - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py deleted file mode 100644 index 5fe62211..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py +++ /dev/null @@ -1,78 +0,0 @@ -import pytest - -from 
src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ - set_up_mock_pdap_client_responses -from tests.helpers.asserts import assert_task_run_success -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_meta_url_in_db_not_sync( - wiped_database, - operator: SyncAgenciesTaskOperator, - db_data_creator: DBDataCreator -): - """ - In an Agency Sync, a URL in the DB validated as a Meta URL linked to the agency - but not included in the most recent sync response should be removed as a link - """ - db_client: AsyncDatabaseClient = operator.adb_client - - # Create Meta URL and link to Agency - agency_id: int = 1 - await db_data_creator.create_agency(agency_id) - meta_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( - validation_type=URLType.META_URL, - record_type=RecordType.CONTACT_INFO_AND_AGENCY_META - ))[0] - meta_url_id: int = meta_url_mapping.url_id - await db_data_creator.create_url_agency_links( - url_ids=[meta_url_id], - agency_ids=[agency_id] - ) - - # Create Sync Response for agency with no Meta URLs - sync_response: AgenciesSyncResponseInfo = 
set_up_sync_response_info( - agency_id=agency_id, - meta_urls=[] - ) - - set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - await check_sync_concluded(db_client) - - # Confirm one agency in the database - agencies: list[Agency] = await db_client.get_all(Agency) - assert len(agencies) == 1 - - # Confirm 1 URL in database - urls: list[URL] = await db_client.get_all(URL) - assert len(urls) == 1 - assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) - - # Confirm no Agency-URL Links - links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) - assert len(links) == 0 - - # Confirm 1 Validated Flag - flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) - assert len(flags) == 1 - assert all(flag.type == URLType.META_URL for flag in flags) - assert all(flag.url_id == meta_url_id for flag in flags) - - - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py deleted file mode 100644 index 772139f4..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py +++ /dev/null @@ -1,62 +0,0 @@ -from unittest.mock import MagicMock, call - -import pytest -from sqlalchemy import select - -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.models.impl.agency.sqlalchemy import Agency -from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import AGENCIES_SYNC_RESPONSES -from tests.automated.integration.tasks.scheduled.impl.sync.agency.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded, 
patch_sync_agencies -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_agency_sync_happy_path( - wiped_database, - setup: SyncAgenciesTaskOperator -): - """ - Test behavior of Agency sync where no meta URLs are returned. - """ - operator = setup - db_client = operator.adb_client - - with patch_sync_agencies(AGENCIES_SYNC_RESPONSES): - run_info = await operator.run_task() - assert_task_run_success(run_info) - mock_func: MagicMock = operator.pdap_client.sync_agencies - - mock_func.assert_has_calls( - [ - call( - AgencySyncParameters( - cutoff_date=None, - page=1 - ) - ), - call( - AgencySyncParameters( - cutoff_date=None, - page=2 - ) - ), - call( - AgencySyncParameters( - cutoff_date=None, - page=3 - ) - ) - ] - ) - - await check_sync_concluded(db_client) - - # Check six entries in database - agencies: list[Agency] = await db_client.scalars(select(Agency)) - assert len(agencies) == 6 - - checker = AgencyChecker() - for agency in agencies: - checker.check(agency) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py deleted file mode 100644 index 0db01723..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py +++ /dev/null @@ -1,53 +0,0 @@ -from datetime import datetime -from unittest.mock import AsyncMock - -import pytest -from sqlalchemy import select - -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.state.sync.agencies import AgenciesSyncState -from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import THIRD_CALL_RESPONSE -from 
tests.automated.integration.tasks.scheduled.impl.sync.agency.existence_checker import AgencyChecker -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import patch_sync_agencies, check_sync_concluded -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_agency_sync_task_no_new_results( - setup: SyncAgenciesTaskOperator -): - operator = setup - db_client = operator.adb_client - - cutoff_date = datetime(2025, 5, 1).date() - - # Add cutoff date to database - await db_client.add( - AgenciesSyncState( - current_cutoff_date=cutoff_date - ) - ) - - with patch_sync_agencies([THIRD_CALL_RESPONSE]): - run_info = await operator.run_task() - assert_task_run_success(run_info) - mock_func: AsyncMock = operator.pdap_client.sync_agencies - mock_func.assert_called_once_with( - AgencySyncParameters( - cutoff_date=cutoff_date, - page=1 - ) - ) - - await check_sync_concluded(db_client, check_updated_at=False) - - # Check two entries in database - agencies: list[Agency] = await db_client.scalars(select(Agency)) - assert len(agencies) == 2 - - # Neither should be updated with new values - checker = AgencyChecker() - for agency in agencies: - checker.check(agency) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py deleted file mode 100644 index 5e63a79d..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py +++ /dev/null @@ -1,77 +0,0 @@ -import pytest - -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.agency.sqlalchemy 
import Agency -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ - set_up_mock_pdap_client_responses -from tests.helpers.asserts import assert_task_run_success -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_same_meta_url_diff_agency( - wiped_database, - operator: SyncAgenciesTaskOperator, - db_data_creator: DBDataCreator -): - """ - Test that, in the case of a Meta URL already linked with one agency in the DB and - a new sync response with the same Meta URL but linked to a different agency, - the link to the original agency should be untouched while the link to the new agency - should be added. 
- """ - db_client: AsyncDatabaseClient = operator.adb_client - existing_agency_id: int = 1 - - await db_data_creator.create_agency(existing_agency_id) - meta_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( - validation_type=URLType.META_URL, - record_type=RecordType.CONTACT_INFO_AND_AGENCY_META - ))[0] - meta_url_id: int = meta_url_mapping.url_id - await db_data_creator.create_url_agency_links( - url_ids=[meta_url_id], - agency_ids=[existing_agency_id] - ) - - new_agency_id: int = 2 - meta_url: str = meta_url_mapping.url - sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( - agency_id=new_agency_id, - meta_urls=[meta_url] - ) - - set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - await check_sync_concluded(db_client) - - # Confirm two agencies in the database - agencies: list[Agency] = await db_client.get_all(Agency) - assert len(agencies) == 2 - - # Confirm 1 URL in database - urls: list[URL] = await db_client.get_all(URL) - assert len(urls) == 1 - assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) - - # Confirm 2 Agency-URL Links - links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) - assert len(links) == 2 - - # Confirm 2 Validated Flag - flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) - assert len(flags) == 1 - assert all(flag.type == URLType.META_URL for flag in flags) - assert all(flag.url_id == meta_url_id for flag in flags) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py deleted file mode 100644 index 247a2ba0..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py +++ /dev/null @@ -1,67 +0,0 @@ 
-import pytest - -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import patch_sync_agencies, \ - check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ - set_up_mock_pdap_client_responses -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_with_meta_url_not_in_database( - wiped_database, - operator: SyncAgenciesTaskOperator -): - """ - In an Agency Sync, a Meta URL included in the sync response - but not present in the DB should be added to the DB with: - - The URLValidationFlag set to `Meta URL` - - The Record Type set to `Contact Info and Agency Meta` - - The link to the agency added - """ - db_client: AsyncDatabaseClient = operator.adb_client - - sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( - agency_id=1, - meta_urls=[ - "https://example.com/meta-url-1", - "https://example.com/meta-url-2", - ] - ) - - set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - await check_sync_concluded(db_client) - - # Confirm one agency in the database - agencies: list[Agency] = await db_client.get_all(Agency) - assert 
len(agencies) == 1 - - # Confirm 2 URLs in database - urls: list[URL] = await db_client.get_all(URL) - assert len(urls) == 2 - assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) - - # Confirm 2 Agency-URL Links - links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) - assert len(links) == 2 - assert all(link.agency_id == 1 for link in links) - assert set(link.url_id for link in links) == set(url.id for url in urls) - - # Confirm 2 Validated Flags - flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) - assert len(flags) == 2 - assert all(flag.type == URLType.META_URL for flag in flags) - assert set(flag.url_id for flag in flags) == set(url.id for url in urls) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py deleted file mode 100644 index dcc1fc23..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py +++ /dev/null @@ -1,36 +0,0 @@ -from datetime import timedelta, datetime - -from sqlalchemy import select, cast, func, TIMESTAMP - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState -from src.db.models.impl.url.core.sqlalchemy import URL - - -async def check_sync_concluded( - db_client: AsyncDatabaseClient, - current_db_datetime: datetime, - check_updated_at: bool = True -) -> None: - - sync_state_results = await db_client.scalar( - select( - DataSourcesSyncState - ) - ) - assert sync_state_results.current_page is None - assert sync_state_results.last_full_sync_at > current_db_datetime - timedelta(minutes=5) - assert 
sync_state_results.current_cutoff_date > (current_db_datetime - timedelta(days=2)).date() - - if not check_updated_at: - return - - updated_ats = await db_client.scalars( - select( - URL.updated_at - ) - ) - assert all( - updated_at > current_db_datetime - timedelta(minutes=5) - for updated_at in updated_ats - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py deleted file mode 100644 index e91461ea..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py +++ /dev/null @@ -1,47 +0,0 @@ -from datetime import datetime - -import pytest_asyncio - -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency -from src.external.pdap.client import PDAPClient -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest_asyncio.fixture -async def operator( - db_data_creator: DBDataCreator, - mock_pdap_client: PDAPClient -) -> SyncDataSourcesTaskOperator: - return SyncDataSourcesTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - -@pytest_asyncio.fixture -async def current_db_time( - adb_client_test: AsyncDatabaseClient -) -> datetime: - return (await adb_client_test.get_current_database_time()).replace(tzinfo=None) - - -@pytest_asyncio.fixture -async def agency_ids( - adb_client_test: AsyncDatabaseClient -) -> list[int]: - """Creates and returns the ids of 4 agencies""" - agencies: list[Agency] = [] - agency_ids: list[int] = [] - for i in range(4): - agency = Agency( - agency_id=i, - name=f"Test Agency {i}", - state="test_state", - county="test_county", - locality="test_locality" - ) - agency_ids.append(i) - agencies.append(agency) - await adb_client_test.add_all(agencies) - return agency_ids 
diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py deleted file mode 100644 index 847add04..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py +++ /dev/null @@ -1,88 +0,0 @@ -from contextlib import contextmanager -from datetime import datetime, timedelta -from unittest.mock import patch, create_autospec, AsyncMock - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import URLType -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.url import \ - TestDataSourcesSyncURLSetupQueryBuilder -from tests.helpers.simple_test_data_functions import generate_test_url - - -@contextmanager -def patch_sync_data_sources(side_effects: list): - with patch.object( - PDAPClient, - "sync_data_sources", - side_effect=side_effects - ): - yield - - - -def set_up_mock_pdap_client_responses( - mock_pdap_client: PDAPClient, - responses: list[DataSourcesSyncResponseInfo | Exception] -) -> None: - """ - Modifies: - - pdap_client.sync_data_sources - """ - mock_sync_data_sources = AsyncMock( - side_effect=responses + [DataSourcesSyncResponseInfo(data_sources=[])] - ) - mock_pdap_client.sync_data_sources = mock_sync_data_sources - -async def set_up_urls( - adb_client: AsyncDatabaseClient, - 
record_type: RecordType, - validated_type: URLType | None = None, - previously_synced: bool = False, -) -> list[int]: - """Creates 2 test URLs.""" - - builder = TestDataSourcesSyncURLSetupQueryBuilder( - record_type=record_type, - validated_type=validated_type, - previously_synced=previously_synced, - ) - - return await adb_client.run_query_builder(builder) - -def _generate_test_data_source_name(i: int) -> str: - return f"Test Data Source {i}" - -def _generate_test_data_source_description(i: int) -> str: - return f"Test Data Source Description {i}" - -def set_up_sync_response_info( - ids: list[int], - record_type: RecordType, - agency_ids: list[int], - approval_status: ApprovalStatus, - ds_url_status: DataSourcesURLStatus, -) -> DataSourcesSyncResponseInfo: - yesterday = datetime.now() - timedelta(days=1) - inner_info_list: list[DataSourcesSyncResponseInnerInfo] = [] - for id_ in ids: - inner_info_list.append( - DataSourcesSyncResponseInnerInfo( - id=id_, - url=generate_test_url(id_), - name=_generate_test_data_source_name(id_), - description=_generate_test_data_source_description(id_), - record_type=record_type, - agency_ids=agency_ids, - approval_status=approval_status, - url_status=ds_url_status, - updated_at=yesterday, - ) - ) - return DataSourcesSyncResponseInfo( - data_sources=inner_info_list, - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py deleted file mode 100644 index 58735685..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py +++ /dev/null @@ -1,59 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic -from src.db.templates.requester import RequesterBase -from tests.helpers.simple_test_data_functions import generate_test_name, generate_test_url - - -class TestDataSourcesSyncURLSetupQueryRequester(RequesterBase): - - async def insert_urls( - self, - record_type: RecordType, - ) -> list[int]: - - insert_models: list[URLInsertModel] = [] - for i in range(2): - url = URLInsertModel( - url=generate_test_url(i), - name=generate_test_name(i), - record_type=record_type, - source=URLSource.COLLECTOR, - ) - insert_models.append(url) - - return await self.session_helper.bulk_insert(self.session, models=insert_models, return_ids=True) - - async def insert_validated_flags( - self, - url_ids: list[int], - validated_type: URLType - ) -> None: - to_insert: list[FlagURLValidatedPydantic] = [] - for url_id in url_ids: - flag = FlagURLValidatedPydantic( - url_id=url_id, - type=validated_type, - ) - to_insert.append(flag) - - await self.session_helper.bulk_insert(self.session, models=to_insert) - - async def insert_data_source_entry( - self, - url_ids: list[int], - ): - to_insert: list[URLDataSourcePydantic] = [ - URLDataSourcePydantic( - url_id=url_id, - data_source_id=url_id, - ) - for url_id in url_ids - ] - - await 
self.session_helper.bulk_insert(self.session, models=to_insert) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py deleted file mode 100644 index f7ceae61..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.queries.base.builder import QueryBuilderBase -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.requester import \ - TestDataSourcesSyncURLSetupQueryRequester - - -class TestDataSourcesSyncURLSetupQueryBuilder(QueryBuilderBase): - - def __init__( - self, - record_type: RecordType, - validated_type: URLType | None = None, - previously_synced: bool = False, - ): - super().__init__() - self.record_type = record_type - self.validated_type = validated_type - self.previously_synced = previously_synced - - async def run(self, session: AsyncSession) -> list[int]: - requester = TestDataSourcesSyncURLSetupQueryRequester(session=session) - - url_ids: list[int] = await requester.insert_urls(record_type=self.record_type) - - if self.validated_type is not None: - await requester.insert_validated_flags(url_ids=url_ids, validated_type=self.validated_type) - - if self.previously_synced: - await requester.insert_data_source_entry(url_ids=url_ids) - - return url_ids - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py deleted file mode 100644 index da243117..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py +++ /dev/null @@ -1,76 +0,0 @@ 
-from datetime import datetime - -import pytest - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ - set_up_mock_pdap_client_responses, set_up_urls - -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error - - -@pytest.mark.asyncio -async def test_db_only( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - current_db_time: datetime -): - """ - Test that operator does nothing with entries only in the database, and nothing is returned by the endpoint. 
- """ - - # Add URLs to database - url_ids: list[int] = await set_up_urls( - adb_client=adb_client_test, - record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, - validated_type=None, - ) - - # Set up pdap client to return nothing - set_up_mock_pdap_client_responses( - operator.pdap_client, - responses=[ - DataSourcesSyncResponseInfo(data_sources=[]) - ] - ) - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - # Check sync concluded - assert operator.pdap_client.sync_data_sources.call_count == 1 - assert operator.pdap_client.sync_data_sources.call_args[0][0] == DataSourcesSyncParameters( - cutoff_date=None, - page=1 - ) - - # Confirm URLs are unchanged in database - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == len(url_ids) - assert {url.id for url in urls} == set(url_ids) - assert all(url.status == URLStatus.OK for url in urls) - assert all(url.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT for url in urls) - - # Confirm presence of sync status row with cutoff date and last updated at after initial db time - await check_sync_concluded( - adb_client_test, - check_updated_at=False, - current_db_datetime=current_db_time - ) - - # Confirm no validated flags - flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) - assert len(flags) == 0 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py deleted file mode 100644 index 3aa26866..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py +++ /dev/null @@ -1,97 +0,0 @@ -from datetime import datetime - -import pytest -from sqlalchemy import select - -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from 
src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.url.enums import TaskOperatorOutcome -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources, \ - set_up_mock_pdap_client_responses, set_up_sync_response_info -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error - -@pytest.mark.asyncio -async def test_data_sources_sync_interruption( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - current_db_time: datetime, - agency_ids: list[int] -): - """ - Test that in the case of an interruption. - The data sources sync will resume from the last processed page. 
- """ - - # Set up endpoint to return URLs on page 1, raise error on page 2 - # return URLs on page 2 on the second call, and return nothing on page 3 - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - set_up_sync_response_info( - ids=[0, 1], - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.OK, - ), - ValueError("test ds sync error"), - set_up_sync_response_info( - ids=[2, 3], - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.OK, - ), - DataSourcesSyncResponseInfo( - data_sources=[], - ) - ] - ) - - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm presence of error - assert run_info.outcome == TaskOperatorOutcome.ERROR - assert "test ds sync error" in run_info.message - - # Confirm first URLs added to database - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == 2 - - # Confirm sync status updated to page 2 and cutoff date is null - sync_state_results = await adb_client_test.scalar( - select( - DataSourcesSyncState - ) - ) - assert sync_state_results.current_page == 2 - assert sync_state_results.last_full_sync_at is None - assert sync_state_results.current_cutoff_date is None - - # Run operator again - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - # Confirm second URLs added to database - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == 4 - - # Confirm page updated to null and cutoff date updated - sync_state_results = await adb_client_test.scalar( - select( - DataSourcesSyncState - ) - ) - assert sync_state_results.current_page is None - assert sync_state_results.last_full_sync_at is not None - assert sync_state_results.current_cutoff_date 
is not None diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py deleted file mode 100644 index 2e5eab87..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py +++ /dev/null @@ -1,88 +0,0 @@ -import pytest - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import set_up_urls, \ - set_up_mock_pdap_client_responses, set_up_sync_response_info -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_meta_url_not_modified( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - agency_ids: list[int], - db_data_creator: DBDataCreator, -): - """ - In a Data Source Sync, a validated Meta URL linked to an agency should be untouched - if the sync response includes that same agency with other Data Sources URL - """ - original_url_ids: list[int] = await set_up_urls( - adb_client=adb_client_test, - record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, - validated_type=URLType.META_URL, - ) - # Link URLs to existing agencies - await 
db_data_creator.create_url_agency_links( - url_ids=original_url_ids, - agency_ids=agency_ids, - ) - - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - set_up_sync_response_info( - ids=[2, 3], - record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, - agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.OK, - ), - ] - ) - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - # Check sync concluded - operator.pdap_client.sync_data_sources.call_count == 2 - - # Confirm presence of 4 URLs in database - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == 4 - assert all([url.status == URLStatus.OK for url in urls]) - assert set([url.record_type for url in urls]) == { - RecordType.CONTACT_INFO_AND_AGENCY_META, - RecordType.COMPLAINTS_AND_MISCONDUCT - } - all_url_ids: list[int] = [url.id for url in urls] - # Check that all original URLs are present - assert set(all_url_ids) >= set(original_url_ids) - - links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) - assert len(links) == 16 - assert set(link.url_id for link in links) == set(all_url_ids) - assert set(link.agency_id for link in links) == set(agency_ids) - - # Confirm presence of validated flag - flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) - assert len(flags) == 4 - assert set([flag.type for flag in flags]) == { - URLType.META_URL, - URLType.DATA_SOURCE, - } - assert set(flag.url_id for flag in flags) == set(all_url_ids) - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py deleted file mode 100644 index 0ae831bd..00000000 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py +++ /dev/null @@ -1,107 +0,0 @@ -from datetime import datetime, timedelta - -import pytest -from sqlalchemy import select - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ - set_up_mock_pdap_client_responses, set_up_sync_response_info -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error - - -@pytest.mark.asyncio -async def test_ds_sync_multiple_calls( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - current_db_time: datetime, - agency_ids: list[int] -): - """ - Test that operator properly handles multiple calls to sync endpoint. 
- """ - - # Set up endpoint to return URLs on page 1 and 2, and stop on page 3 - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - set_up_sync_response_info( - ids=[0, 1], - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.OK, - ), - set_up_sync_response_info( - ids=[2, 3], - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.OK, - ), - DataSourcesSyncResponseInfo( - data_sources=[], - ) - ] - ) - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - - # Confirm URLs are added to database - urls: list[URL] = await adb_client_test.get_all(URL) - assert all(url.status == URLStatus.OK for url in urls) - assert all(url.record_type == RecordType.ACCIDENT_REPORTS for url in urls) - url_ids: list[int] = [url.id for url in urls] - - # Confirm 3 calls to pdap_client.sync_data_sources - assert operator.pdap_client.sync_data_sources.call_count == 3 - - # Confirm sync status updated - sync_state_results = await adb_client_test.scalar( - select( - DataSourcesSyncState - ) - ) - assert sync_state_results.current_page is None - assert sync_state_results.last_full_sync_at > current_db_time - timedelta(minutes=5) - assert sync_state_results.current_cutoff_date > (current_db_time - timedelta(days=2)).date() - - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - DataSourcesSyncResponseInfo( - data_sources=[], - ) - ] - ) - - # Run operator again - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - # Confirm no new URLs added - urls: list[URL] = await adb_client_test.get_all(URL) - assert set([url.id for url in 
urls]) == set(url_ids) - - # Confirm call to pdap_client.sync_data_sources made with cutoff_date - assert operator.pdap_client.sync_data_sources.called_once_with( - DataSourcesSyncParameters( - cutoff_date=sync_state_results.current_cutoff_date, - page=1 - ) - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py deleted file mode 100644 index 9a6bf120..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py +++ /dev/null @@ -1,85 +0,0 @@ -from datetime import datetime - -import pytest - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ - set_up_mock_pdap_client_responses, set_up_sync_response_info -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error - - -@pytest.mark.asyncio -async def test_url_broken_approved( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - agency_ids: list[int], - 
current_db_time: datetime -): - """ - Test that a data source with - - a broken URL status - - an approved status - Is added to the data source with a 404 Not Found status. - """ - - # Set up pdap client to return url with broken url status but approved - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - set_up_sync_response_info( - ids=[0, 1], - record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, - agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.BROKEN, - ), - ] - ) - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - # Check sync concluded - operator.pdap_client.sync_data_sources.call_count == 2 - - # Confirm presence of URL with status of `404 not found` - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == 2 - assert all([url.status == URLStatus.NOT_FOUND for url in urls]) - assert all([url.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT for url in urls]) - url_ids: list[int] = [url.id for url in urls] - - # Confirm presence of agencies - links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) - assert len(links) == 8 - assert set(link.url_id for link in links) == set(url_ids) - assert set(link.agency_id for link in links) == set(agency_ids) - - # Confirm presence of validated flag - flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) - assert len(flags) == 2 - assert all([flag.type == URLType.DATA_SOURCE for flag in flags]) - assert set(flag.url_id for flag in flags) == set(url_ids) - - # Confirm presence of sync status row - await check_sync_concluded( - adb_client_test, - current_db_datetime=current_db_time - ) - - - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py deleted file mode 100644 index f305cee4..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py +++ /dev/null @@ -1,94 +0,0 @@ -import pytest - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import set_up_urls, \ - set_up_mock_pdap_client_responses, set_up_sync_response_info -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error - - -@pytest.mark.asyncio -async def test_url_in_db_overwritten_by_ds( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - agency_ids: list[int] -): - """ - Test that a URL in the database is overwritten by a data source with the same URL, - if their information is different. 
- """ - old_agency_ids: list[int] = agency_ids[:2] - new_agency_ids: list[int] = agency_ids[2:4] - - - # Add URLs to database - url_ids: list[int] = await set_up_urls( - adb_client=adb_client_test, - record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, - validated_type=URLType.DATA_SOURCE, - ) - # Link URLs to 2 existing agencies - links: list[LinkURLAgency] = [] - for url_id in url_ids: - for agency_id in old_agency_ids: - link = LinkURLAgency( - url_id=url_id, - agency_id=agency_id, - ) - links.append(link) - await adb_client_test.add_all(links) - - # Set up pdap client to return same URLs with different information - # - different name - # - different description - # - different status - # - different approval status (approved vs. not relevant) - # - different record type - # - different agencies assigned - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - set_up_sync_response_info( - ids=[0, 1], - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=new_agency_ids, - approval_status=ApprovalStatus.REJECTED, - ds_url_status=DataSourcesURLStatus.BROKEN, - ), - ] - ) - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - - # Confirm URL name, description, record type, and status are overwritten - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == 2 - assert all([url.status == URLStatus.NOT_FOUND for url in urls]) - assert all([url.record_type == RecordType.ACCIDENT_REPORTS for url in urls]) - url_ids: list[int] = [url.id for url in urls] - - # Confirm agencies are overwritten - links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) - assert len(links) == 4 - assert set(link.url_id for link in links) == set(url_ids) - assert set(link.agency_id for link in links) == set(new_agency_ids) - - # Confirm validated types overwritten - flags: list[FlagURLValidated] = await 
adb_client_test.get_all(FlagURLValidated) - assert len(flags) == 2 - assert all([flag.type == URLType.NOT_RELEVANT for flag in flags]) - assert set(flag.url_id for flag in flags) == set(url_ids) - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py deleted file mode 100644 index 157353ab..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ - set_up_mock_pdap_client_responses, set_up_sync_response_info -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error - - -@pytest.mark.asyncio -async def test_url_ok_approved( - operator: SyncDataSourcesTaskOperator, - adb_client_test: AsyncDatabaseClient, - agency_ids: list[int] -): - """ - Test that a URL with an OK URL status and an approved status - is added to the database with an OK status - and a validated flag with `submitted=True` - """ - - # Set up pdap client to return url with ok url status and approved - set_up_mock_pdap_client_responses( - mock_pdap_client=operator.pdap_client, - responses=[ - set_up_sync_response_info( - ids=[0, 1], - record_type=RecordType.OTHER, - 
agency_ids=agency_ids, - approval_status=ApprovalStatus.APPROVED, - ds_url_status=DataSourcesURLStatus.OK, - ), - ] - ) - - # Run operator - run_info: TaskOperatorRunInfo = await operator.run_task() - - # Confirm operator ran without error - assert_task_ran_without_error(run_info) - - # Check sync concluded - operator.pdap_client.sync_data_sources.call_count == 2 - - # Confirm URL is added to database with OK status - urls: list[URL] = await adb_client_test.get_all(URL) - assert len(urls) == 2 - assert all([url.status == URLStatus.OK for url in urls]) - assert all([url.record_type == RecordType.OTHER for url in urls]) - url_ids: list[int] = [url.id for url in urls] - - # Confirm presence of validated flag - flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) - assert len(flags) == 2 - assert all([flag.type == URLType.DATA_SOURCE for flag in flags]) - assert set(flag.url_id for flag in flags) == set(url_ids) diff --git a/tests/automated/integration/tasks/scheduled/loader/test_flags.py b/tests/automated/integration/tasks/scheduled/loader/test_flags.py index ae399c64..9476390d 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_flags.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_flags.py @@ -7,8 +7,6 @@ from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.run_url_tasks.operator import RunURLTasksTaskOperator -from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase 
@@ -23,14 +21,6 @@ class Config: operator: type[ScheduledTaskOperatorBase] params: list[FlagTestParams] = [ - FlagTestParams( - env_var="SYNC_AGENCIES_TASK_FLAG", - operator=SyncAgenciesTaskOperator - ), - FlagTestParams( - env_var="SYNC_DATA_SOURCES_TASK_FLAG", - operator=SyncDataSourcesTaskOperator - ), FlagTestParams( env_var="PUSH_TO_HUGGING_FACE_TASK_FLAG", operator=PushToHuggingFaceTaskOperator diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index f2dd795c..d7c43e97 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 8 +NUMBER_OF_ENTRIES = 6 @pytest.mark.asyncio async def test_happy_path( diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py b/tests/automated/integration/tasks/url/impl/validate/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py rename to tests/automated/integration/tasks/url/impl/validate/__init__.py diff --git a/tests/automated/integration/tasks/url/impl/validate/conftest.py b/tests/automated/integration/tasks/url/impl/validate/conftest.py new file mode 100644 index 00000000..0bcc5712 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/validate/conftest.py @@ -0,0 +1,32 @@ +import pytest +import pytest_asyncio + +from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + 
+@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> AutoValidateURLTaskOperator: + return AutoValidateURLTaskOperator( + adb_client=adb_client_test, + ) + +@pytest_asyncio.fixture +async def helper( + db_data_creator: DBDataCreator, + pittsburgh_locality: LocalityCreationInfo +) -> TestValidateTaskHelper: + url_id: int = (await db_data_creator.create_urls(count=1, record_type=None))[0].url_id + agency_id: int = await db_data_creator.agency() + return TestValidateTaskHelper( + db_data_creator, + url_id=url_id, + agency_id=agency_id, + location_id=pittsburgh_locality.location_id + ) + diff --git a/tests/automated/integration/tasks/url/impl/validate/helper.py b/tests/automated/integration/tasks/url/impl/validate/helper.py new file mode 100644 index 00000000..85b13695 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/validate/helper.py @@ -0,0 +1,120 @@ +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.auto_validated.sqlalchemy import FlagURLAutoValidated +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.conftest import db_data_creator +from tests.helpers.counter import next_int +from tests.helpers.data_creator.core import DBDataCreator + +DEFAULT_RECORD_TYPE: RecordType = RecordType.INCARCERATION_RECORDS + +class TestValidateTaskHelper: + + def __init__( + self, + db_data_creator: DBDataCreator, + url_id: int, + agency_id: int, + location_id: int + ): + self.db_data_creator = db_data_creator + self.adb_client: AsyncDatabaseClient = db_data_creator.adb_client + self.url_id = url_id + self.agency_id = agency_id + self.location_id = 
location_id + + + async def check_url_validated( + self, + url_type: URLType, + ) -> None: + validated_flags: list[FlagURLValidated] = await self.adb_client.get_all(FlagURLValidated) + assert len(validated_flags) == 1 + validated_flag: FlagURLValidated = validated_flags[0] + assert validated_flag.url_id == self.url_id + assert validated_flag.type == url_type + + async def check_auto_validated( + self, + ) -> None: + auto_validated_flags: list[FlagURLAutoValidated] = await self.adb_client.get_all(FlagURLAutoValidated) + assert len(auto_validated_flags) == 1 + auto_validated_flag: FlagURLAutoValidated = auto_validated_flags[0] + assert auto_validated_flag.url_id == self.url_id + + async def check_agency_linked( + self + ) -> None: + links: list[LinkURLAgency] = await self.adb_client.get_all(LinkURLAgency) + assert len(links) == 1 + link: LinkURLAgency = links[0] + assert link.url_id == self.url_id + assert link.agency_id == self.agency_id + + async def check_record_type( + self, + record_type: RecordType = DEFAULT_RECORD_TYPE + ): + record_types: list[URLRecordType] = await self.adb_client.get_all(URLRecordType) + assert len(record_types) == 1 + rt: URLRecordType = record_types[0] + assert rt.url_id == self.url_id + assert rt.record_type == record_type + + async def add_url_type_suggestions( + self, + url_type: URLType, + count: int = 1 + ): + for _ in range(count): + await self.db_data_creator.user_relevant_suggestion( + suggested_status=url_type, + url_id=self.url_id, + user_id=next_int() + ) + + async def add_agency_suggestions( + self, + count: int = 1, + agency_id: int | None = None + ): + if agency_id is None: + agency_id = self.agency_id + for i in range(count): + await self.db_data_creator.agency_user_suggestions( + url_id=self.url_id, + user_id=next_int(), + agency_annotation_info=URLAgencyAnnotationPostInfo( + suggested_agency=agency_id + ) + ) + + async def add_location_suggestions( + self, + count: int = 1, + location_id: int | None = None + ): + if 
location_id is None: + location_id = self.location_id + for i in range(count): + await self.db_data_creator.add_user_location_suggestion( + url_id=self.url_id, + user_id=next_int(), + location_id=location_id, + ) + + async def add_record_type_suggestions( + self, + count: int = 1, + record_type: RecordType = DEFAULT_RECORD_TYPE + ): + for i in range(count): + await self.db_data_creator.user_record_type_suggestion( + url_id=self.url_id, + record_type=record_type, + user_id=next_int() + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py new file mode 100644 index 00000000..500d147c --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py @@ -0,0 +1,62 @@ +""" +Add a URL with two of the same suggestions for each of the following: +- Agency +- Location +- Record Type +- URL Type (DATA SOURCE) +And confirm it is validated as DATA SOURCE +""" +import pytest + +from src.core.enums import RecordType +from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_data_source( + operator: AutoValidateURLTaskOperator, + helper: TestValidateTaskHelper +): + await helper.add_url_type_suggestions( + url_type=URLType.DATA_SOURCE, + count=2 + ) + + assert not await operator.meets_task_prerequisites() + + await helper.add_agency_suggestions(count=2) + + assert not await operator.meets_task_prerequisites() + + await helper.add_location_suggestions(count=2) + + assert not await operator.meets_task_prerequisites() + + await helper.add_record_type_suggestions(count=2) + + assert await operator.meets_task_prerequisites() + + # Add different 
record type suggestion + await helper.add_record_type_suggestions( + count=2, + record_type=RecordType.STOPS + ) + + # Assert no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add tiebreaker + await helper.add_record_type_suggestions() + + assert await operator.meets_task_prerequisites() + + await run_task_and_confirm_success(operator) + + await helper.check_url_validated(URLType.DATA_SOURCE) + await helper.check_auto_validated() + await helper.check_agency_linked() + await helper.check_record_type() + diff --git a/tests/automated/integration/tasks/url/impl/validate/test_individual_record.py b/tests/automated/integration/tasks/url/impl/validate/test_individual_record.py new file mode 100644 index 00000000..664b52d4 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/validate/test_individual_record.py @@ -0,0 +1,53 @@ +import pytest + +from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_individual_record( + operator: AutoValidateURLTaskOperator, + helper: TestValidateTaskHelper +): + """ + Add URL with 2 INDIVIDUAL RECORD suggestions. 
Check validated as INDIVIDUAL RECORD + """ + # Add two INDIVIDUAL record suggestions + await helper.add_url_type_suggestions( + url_type=URLType.INDIVIDUAL_RECORD, + count=2 + ) + + assert not await operator.meets_task_prerequisites() + + await helper.add_agency_suggestions(count=2) + + assert not await operator.meets_task_prerequisites() + + await helper.add_location_suggestions(count=2) + + assert await operator.meets_task_prerequisites() + + # Add additional agency suggestions to create tie + additional_agency_id: int = await helper.db_data_creator.agency() + await helper.add_agency_suggestions( + count=2, + agency_id=additional_agency_id + ) + + # Confirm no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add tiebreaker suggestion + await helper.add_agency_suggestions() + + assert await operator.meets_task_prerequisites() + + await run_task_and_confirm_success(operator) + + await helper.check_url_validated(URLType.INDIVIDUAL_RECORD) + await helper.check_auto_validated() + await helper.check_agency_linked() + diff --git a/tests/automated/integration/tasks/url/impl/validate/test_meta_url.py b/tests/automated/integration/tasks/url/impl/validate/test_meta_url.py new file mode 100644 index 00000000..be88157f --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/validate/test_meta_url.py @@ -0,0 +1,60 @@ +""" +Add a URL with two of the same suggestions for each of the following: +- Agency +- Location +- URL Type (META URL) +And confirm it is validated as META URL +""" +import pytest + +from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def 
test_meta_url( + operator: AutoValidateURLTaskOperator, + helper: TestValidateTaskHelper, + allegheny_county: CountyCreationInfo +): + # Add two META URL suggestions + await helper.add_url_type_suggestions(URLType.META_URL, count=2) + + # Assert operator does not yet meet task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add two Agency suggestions + await helper.add_agency_suggestions(count=2) + + # Assert operator does not yet meet task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add two location suggestions + await helper.add_location_suggestions(count=2) + + # Assert operator now meets task prerequisites + assert await operator.meets_task_prerequisites() + + # Add additional two location suggestions for different location + await helper.add_location_suggestions( + count=2, + location_id=allegheny_county.location_id + ) + + # Assert operator no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add additional location suggestion as tiebreaker + await helper.add_location_suggestions() + + # Assert operator again meets task prerequisites + assert await operator.meets_task_prerequisites() + + await run_task_and_confirm_success(operator) + + await helper.check_url_validated(URLType.META_URL) + await helper.check_auto_validated() + await helper.check_agency_linked() diff --git a/tests/automated/integration/tasks/url/impl/validate/test_not_relevant.py b/tests/automated/integration/tasks/url/impl/validate/test_not_relevant.py new file mode 100644 index 00000000..288f61e9 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/validate/test_not_relevant.py @@ -0,0 +1,56 @@ +import pytest + +from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from tests.helpers.run import 
run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_not_relevant( + operator: AutoValidateURLTaskOperator, + helper: TestValidateTaskHelper +): + """ + Add URL with 2 NOT RELEVANT suggestions. Check validated as NOT RELEVANT + """ + + # Assert operator does not yet meet task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add one NOT RELEVANT suggestion + await helper.add_url_type_suggestions( + url_type=URLType.NOT_RELEVANT, + ) + + # Assert operator does not yet meet task prerequisites + assert not await operator.meets_task_prerequisites() + + # Add second NOT RELEVANT suggestion + await helper.add_url_type_suggestions( + url_type=URLType.NOT_RELEVANT, + ) + + # Assert operator now meets task prerequisites + assert await operator.meets_task_prerequisites() + + # Add different suggestion to create tie + await helper.add_url_type_suggestions( + url_type=URLType.META_URL, + count=2 + ) + assert not await operator.meets_task_prerequisites() + + # Add tiebreaker + await helper.add_url_type_suggestions( + url_type=URLType.NOT_RELEVANT + ) + + await run_task_and_confirm_success(operator) + + # Assert URL validated as NOT RELEVANT + await helper.check_url_validated( + url_type=URLType.NOT_RELEVANT, + ) + + await helper.check_auto_validated() diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index 2ff92e69..7ba76a79 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader -NUMBER_OF_TASK_OPERATORS = 11 +NUMBER_OF_TASK_OPERATORS = 12 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/automated/unit/api/test_all_annotation_post_info.py b/tests/automated/unit/api/test_all_annotation_post_info.py index 549f6d79..c3b7a526 100644 --- 
a/tests/automated/unit/api/test_all_annotation_post_info.py +++ b/tests/automated/unit/api/test_all_annotation_post_info.py @@ -42,8 +42,8 @@ class TestAllAnnotationPostInfoParams(BaseModel): TestAllAnnotationPostInfoParams( suggested_status=URLType.INDIVIDUAL_RECORD, record_type=None, - agency_ids=[], - location_ids=[], + agency_ids=[1, 2], + location_ids=[3, 4], raise_exception=False ), # Error Paths - Meta URL diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index eb7ef3f7..17032b60 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -25,6 +25,7 @@ from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -619,6 +620,19 @@ async def add_compressed_html( ] await self.adb_client.add_all(compressed_html_inserts) + async def add_user_location_suggestion( + self, + url_id: int, + user_id: int, + location_id: int, + ): + suggestion = UserLocationSuggestion( + url_id=url_id, + user_id=user_id, + location_id=location_id, + ) + await self.adb_client.add(suggestion) + async def add_location_suggestion( self, url_id: int, diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index fb3c20ad..200a34cd 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -12,6 +12,7 @@ from src.db.models.impl.url.core.enums import URLSource from 
src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic from tests.helpers.counter import COUNTER, next_int from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links @@ -40,11 +41,20 @@ async def create_urls( urls: list[URLInsertModel] = generate_urls( status=status, source=source, - record_type=record_type, collector_metadata=collector_metadata, count=count, ) url_ids = await adb_client.bulk_insert(urls, return_ids=True) + if record_type is not None: + record_types: list[URLRecordTypePydantic] = [ + URLRecordTypePydantic( + url_id=url_id, + record_type=record_type, + ) + for url_id in url_ids + ] + await adb_client.bulk_insert(record_types) + return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] async def create_validated_flags( diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index ad730a71..1cf0a806 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -41,7 +41,6 @@ def generate_batch_url_links( def generate_urls( status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, - record_type: RecordType | None = RecordType.RESOURCES, collector_metadata: dict | None = None, count: int = 1 ) -> list[URLInsertModel]: @@ -54,7 +53,6 @@ def generate_urls( source=source, name=f"Example {val}", collector_metadata=collector_metadata, - record_type=record_type, )) return results diff --git a/tests/manual/external/pdap/sync/__init__.py b/tests/manual/external/pdap/sync/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/manual/external/pdap/sync/test_sync_agencies.py b/tests/manual/external/pdap/sync/test_sync_agencies.py deleted file mode 100644 
index f5af7a7e..00000000 --- a/tests/manual/external/pdap/sync/test_sync_agencies.py +++ /dev/null @@ -1,37 +0,0 @@ -import pytest -import time - -from pendulum import tomorrow - -from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters - - -@pytest.mark.asyncio -async def test_sync_agencies(pdap_client_dev): - - start = time.perf_counter() - response = await pdap_client_dev.sync_agencies( - params=AgencySyncParameters( - page=1, - cutoff_date=None - ) - ) - end = time.perf_counter() - print(response) - - duration = end - start - print(f"Duration: {duration:.4f} seconds") - -@pytest.mark.asyncio -async def test_sync_agencies_cutoff(pdap_client_dev): - - start = time.perf_counter() - response = await pdap_client_dev.sync_agencies( - params=AgencySyncParameters( - page=1, - cutoff_date=tomorrow() - ) - ) - end = time.perf_counter() - print(response) -