diff --git a/ENV.md b/ENV.md index a4ae17a7..386dbdae 100644 --- a/ENV.md +++ b/ENV.md @@ -57,29 +57,30 @@ Note that some tasks/subtasks are themselves enabled by other tasks. ### Scheduled Task Flags -| Flag | Description | -|----------------------------------------|-------------------------------------------------------------------------------| -| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | -| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | -| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | -| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | -| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | -| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | -| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | -| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | -| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | -| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | -| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | -| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | -| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| -| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| -| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| -| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| -| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| -| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| +| Flag | Description | +|--------------------------------------------|-------------------------------------------------------------------------------| +| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | +| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | +| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | +| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | +| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | +| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | +| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | +| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | +| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | +| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | +| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | +| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| +| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| +| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| +| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| +| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| +| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| +| `INTEGRITY_MONITOR_TASK_FLAG` | Runs integrity checks. | ### URL Task Flags diff --git a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py new file mode 100644 index 00000000..1f44dd25 --- /dev/null +++ b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py @@ -0,0 +1,97 @@ +"""Add integrity monitor + +Revision ID: 5ac9d50b91c5 +Revises: 1bb2dfad3275 +Create Date: 2025-11-23 19:23:45.487445 + +""" +from typing import Sequence, Union + +from alembic import op + +from src.util.alembic_helpers import add_enum_value + +# revision identifiers, used by Alembic. +revision: str = '5ac9d50b91c5' +down_revision: Union[str, None] = '1bb2dfad3275' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +def upgrade() -> None: + _create_integrity_task() + _create_incomplete_data_sources_view() + _create_incomplete_meta_urls_view() + _create_url_both_data_source_and_meta_url_view() + _create_non_federal_agencies_no_location_view() + +def _create_non_federal_agencies_no_location_view(): + op.execute(""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.id as agency_id + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """) + + +def _create_url_both_data_source_and_meta_url_view(): + op.execute(""" + create view integrity__url_both_data_source_and_meta_url_view as + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id + """) + + +def _create_incomplete_meta_urls_view(): + op.execute(""" + create view integrity__incomplete_meta_urls_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null + """) + + +def _create_incomplete_data_sources_view(): + op.execute(""" + create view integrity__incomplete_data_sources_view as + select + ds.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + or lau.url_id is null + """) + + +def _create_integrity_task(): + add_enum_value( + enum_name="task_type", + enum_value="Integrity Monitor", + ) + +def downgrade() -> None: + pass diff --git a/src/core/tasks/scheduled/impl/integrity/__init__.py b/src/core/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/exceptions.py b/src/core/tasks/scheduled/impl/integrity/exceptions.py new file mode 100644 index 00000000..3e9f797e --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/exceptions.py @@ -0,0 +1,4 @@ + + +class IntegrityMonitorTaskException(Exception): + pass \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/operator.py b/src/core/tasks/scheduled/impl/integrity/operator.py new file mode 100644 index 00000000..42ca43bb --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/operator.py @@ -0,0 +1,30 @@ +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.impl.integrity.exceptions import IntegrityMonitorTaskException +from src.core.tasks.scheduled.impl.integrity.queries.get import GetIntegrityTaskDataQueryBuilder +from src.core.tasks.scheduled.impl.integrity.queries.prereq import GetIntegrityTaskPrerequisitesQueryBuilder +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.enums import TaskType + + +class IntegrityMonitorTaskOperator( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin +): + + @property + def task_type(self) -> TaskType: + return TaskType.INTEGRITY_MONITOR + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + query_builder=GetIntegrityTaskPrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + failing_views: list[str] = await self.run_query_builder( + query_builder=GetIntegrityTaskDataQueryBuilder() + ) + raise IntegrityMonitorTaskException( + f"Integrity Monitor Task failed for the following views {failing_views}", + ) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/__init__.py b/src/core/tasks/scheduled/impl/integrity/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/queries/cte.py b/src/core/tasks/scheduled/impl/integrity/queries/cte.py new file mode 100644 index 00000000..dc894ea7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/cte.py @@ -0,0 +1,61 @@ +from sqlalchemy import select, literal, Exists, Label, or_ + +from src.db.models.templates_.base import Base +from src.db.models.views.integrity.incomplete_data_sources import IntegrityIncompleteDataSource +from src.db.models.views.integrity.incomplete_meta_urls import IntegrityIncompleteMetaURL +from src.db.models.views.integrity.non_federal_agencies_no_location import IntegrityNonFederalAgenciesNoLocation +from src.db.models.views.integrity.url_both_data_source_and_meta_url import IntegrityURLBothDataSourceAndMetaURL + + +def any_row_exists( + model: type[Base] +) -> Exists: + return ( + select( + literal(1) + ) + .select_from( + model + ) + .exists() + ) + +class IntegrityTaskCTEContainer: + + def __init__( + self, + ): + self.models: list[type[Base]] = [ + IntegrityURLBothDataSourceAndMetaURL, + IntegrityNonFederalAgenciesNoLocation, + IntegrityIncompleteMetaURL, + IntegrityIncompleteDataSource, + ] + + expressions: list[Label[bool]] = [ + any_row_exists(model) + .label(model.__tablename__) + for model in self.models + ] + + self.cte = ( + select( + *expressions + ) + .cte( + name="integrity_task_cte", + ) + ) + + @property + def any_rows_exist_query(self) -> select: + expression = [ + getattr(self.cte.c, model.__tablename__) + for model in self.models + ] + return select(or_(*expression)) + + @property + def select_all_columns_query(self) -> select: + return select(self.cte) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/get.py b/src/core/tasks/scheduled/impl/integrity/queries/get.py new file mode 100644 index 00000000..b8632fa2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/get.py @@ -0,0 +1,20 @@ +from sqlalchemy import RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskDataQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[str]: + cte = IntegrityTaskCTEContainer() + mapping: RowMapping = await self.sh.mapping( + session=session, + query=cte.select_all_columns_query + ) + return [ + model.__tablename__ + for model in cte.models + if mapping[model.__tablename__] + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/queries/prereq.py b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py new file mode 100644 index 00000000..12a6fa33 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py @@ -0,0 +1,16 @@ +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> Any: + cte = IntegrityTaskCTEContainer() + return await self.sh.scalar( + session=session, + query=cte.any_rows_exist_query + ) diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 394a60ce..116bf56d 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -6,6 +6,7 @@ from src.core.tasks.scheduled.impl.delete_logs.operator import DeleteOldLogsTaskOperator from src.core.tasks.scheduled.impl.delete_stale_screenshots.operator import DeleteStaleScreenshotsTaskOperator from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.mark_never_completed.operator import MarkTaskNeverCompletedOperator @@ -127,6 +128,13 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.DAILY.value, enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") ), + ScheduledTaskEntry( + operator=IntegrityMonitorTaskOperator( + adb_client=self.adb_client + ), + interval_minutes=IntervalEnum.DAILY.value, + enabled=self.setup_flag("INTEGRITY_MONITOR_TASK_FLAG") + ), # Sync ## Adds ### Agency diff --git a/src/db/enums.py b/src/db/enums.py index 034ec0b8..65f446c5 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -63,6 +63,7 @@ class TaskType(PyEnum): TASK_CLEANUP = "Task Cleanup" REFRESH_MATERIALIZED_VIEWS = "Refresh Materialized Views" UPDATE_URL_STATUS = "Update URL Status" + INTEGRITY_MONITOR = "Integrity Monitor" # Sync Tasks SYNC_AGENCIES_ADD = "Sync Agencies Add" diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index 592973a6..e1c77978 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -46,9 +46,12 @@ def location_id_column() -> Column[int]: CURRENT_TIME_SERVER_DEFAULT = func.now() +VIEW_ARG = {"info": "view"} + def url_id_primary_key_constraint() -> PrimaryKeyConstraint: return PrimaryKeyConstraint('url_id') + def county_column(nullable: bool = False) -> Column[int]: return Column( Integer(), diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py index 97abf056..081441d8 100644 --- a/src/db/models/impl/flag/url_validated/sqlalchemy.py +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -1,4 +1,5 @@ from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy.orm import Mapped from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType @@ -19,7 +20,7 @@ class FlagURLValidated( ), ) - type = enum_column( + type: Mapped[URLType] = enum_column( enum_type=URLType, name="url_type", ) diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 417eae40..7a7d6460 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -3,7 +3,8 @@ from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event from src.db.models.exceptions import WriteToViewError -from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT +from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT, url_id_primary_key_constraint, \ + VIEW_ARG class URLDependentMixin: @@ -90,3 +91,9 @@ def __declare_last__(cls) -> None: @staticmethod def _block_write(mapper, connection, target): raise WriteToViewError(f"{type(target).__name__} is a read-only view.") + +class URLDependentViewMixin(URLDependentMixin, ViewMixin): + __table_args__ = ( + url_id_primary_key_constraint(), + VIEW_ARG + ) \ No newline at end of file diff --git a/src/db/models/views/dependent_locations.py b/src/db/models/views/dependent_locations.py index 95f3db98..425e25a6 100644 --- a/src/db/models/views/dependent_locations.py +++ b/src/db/models/views/dependent_locations.py @@ -31,6 +31,7 @@ """ from sqlalchemy import Column, Integer, ForeignKey +from src.db.models.helpers import VIEW_ARG from src.db.models.mixins import ViewMixin from src.db.models.templates_.base import Base @@ -39,7 +40,7 @@ class DependentLocationView(Base, ViewMixin): __tablename__ = "dependent_locations" __table_args__ = ( - {"info": "view"} + VIEW_ARG, ) parent_location_id = Column( diff --git a/src/db/models/views/integrity/__init__.py b/src/db/models/views/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/views/integrity/incomplete_data_sources.py b/src/db/models/views/integrity/incomplete_data_sources.py new file mode 100644 index 00000000..06efa3b4 --- /dev/null +++ b/src/db/models/views/integrity/incomplete_data_sources.py @@ -0,0 +1,38 @@ +""" + create view integrity__incomplete_data_sources_view as + select + ds.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + or lau.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteDataSource( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_data_sources_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + has_record_type = Column(Boolean) + has_agency_flag = Column(Boolean) diff --git a/src/db/models/views/integrity/incomplete_meta_urls.py b/src/db/models/views/integrity/incomplete_meta_urls.py new file mode 100644 index 00000000..a837c156 --- /dev/null +++ b/src/db/models/views/integrity/incomplete_meta_urls.py @@ -0,0 +1,36 @@ +""" + create view integrity__incomplete_meta_urls_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_meta_urls_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + has_agency_flag = Column(Boolean) + + diff --git a/src/db/models/views/integrity/non_federal_agencies_no_location.py b/src/db/models/views/integrity/non_federal_agencies_no_location.py new file mode 100644 index 00000000..73e547b9 --- /dev/null +++ b/src/db/models/views/integrity/non_federal_agencies_no_location.py @@ -0,0 +1,27 @@ +""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.id as agency_id + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """ +from sqlalchemy import String, Column, PrimaryKeyConstraint + +from src.db.models.helpers import VIEW_ARG +from src.db.models.mixins import ViewMixin, AgencyDependentMixin +from src.db.models.templates_.base import Base + +class IntegrityNonFederalAgenciesNoLocation( + Base, + ViewMixin, + AgencyDependentMixin, +): + __tablename__ = "integrity__non_federal_agencies_no_location_view" + __table_args__ = ( + PrimaryKeyConstraint("agency_id"), + VIEW_ARG, + ) + + name = Column(String) diff --git a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..0de88314 --- /dev/null +++ b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py @@ -0,0 +1,21 @@ +""" + create view integrity__url_both_data_source_and_meta_url_view as + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id +""" + +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class IntegrityURLBothDataSourceAndMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__url_both_data_source_and_meta_url_view" + + diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py index 20437075..a2d64ca9 100644 --- a/src/db/models/views/meta_url.py +++ b/src/db/models/views/meta_url.py @@ -9,18 +9,13 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class MetaURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): __tablename__ = "meta_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py index bcfa9293..baf5f071 100644 --- a/src/db/models/views/unvalidated_url.py +++ b/src/db/models/views/unvalidated_url.py @@ -11,18 +11,13 @@ """ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class UnvalidatedURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): - __tablename__ = "unvalidated_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file + __tablename__ = "unvalidated_url_view" \ No newline at end of file diff --git a/src/db/models/views/url_anno_count.py b/src/db/models/views/url_anno_count.py index 232f0d21..2e910afb 100644 --- a/src/db/models/views/url_anno_count.py +++ b/src/db/models/views/url_anno_count.py @@ -98,21 +98,16 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Integer from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationCount( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_count_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) auto_agency_count = Column(Integer, nullable=False) auto_location_count = Column(Integer, nullable=False) diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py index 47250d1b..c133fbfc 100644 --- a/src/db/models/views/url_annotations_flags.py +++ b/src/db/models/views/url_annotations_flags.py @@ -24,20 +24,15 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Boolean -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationFlagsView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_flags" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) has_auto_record_type_suggestion = Column(Boolean, nullable=False) has_auto_relevant_suggestion = Column(Boolean, nullable=False) diff --git a/src/db/models/views/url_status/core.py b/src/db/models/views/url_status/core.py index 77a01139..be771fe5 100644 --- a/src/db/models/views/url_status/core.py +++ b/src/db/models/views/url_status/core.py @@ -59,19 +59,14 @@ from sqlalchemy import String, Column from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLStatusMatView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_status_mat_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) status = Column(String) \ No newline at end of file diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 6e2be0f0..4c6a76d0 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -6,6 +6,7 @@ from starlette.testclient import TestClient from src.api.main import app +from src.collectors.enums import URLStatus from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore from src.core.enums import RecordType @@ -14,6 +15,8 @@ from src.db.client.sync import DatabaseClient from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.security.dtos.access_info import AccessInfo from src.security.enums import Permissions from src.security.manager import get_access_info @@ -218,6 +221,18 @@ async def test_url_data_source_id( ) return url_id +@pytest_asyncio.fixture +async def test_url_id( + db_data_creator: DBDataCreator, +) -> int: + url = URL( + url="example.com", + source=URLSource.COLLECTOR, + trailing_slash=False, + status=URLStatus.OK + ) + return await db_data_creator.adb_client.add(url, return_id=True) + @pytest_asyncio.fixture async def test_url_data_source_mapping( db_data_creator: DBDataCreator, diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py b/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py new file mode 100644 index 00000000..9106f1b7 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py @@ -0,0 +1,13 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> IntegrityMonitorTaskOperator: + return IntegrityMonitorTaskOperator( + adb_client=adb_client_test, + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py new file mode 100644 index 00000000..2b617ca2 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py @@ -0,0 +1,12 @@ +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.core.tasks.url.enums import TaskOperatorOutcome + + +async def run_task_and_confirm_error( + operator: IntegrityMonitorTaskOperator, + expected_view: str +) -> None: + run_info: TaskOperatorRunInfo = await operator.run_task() + assert run_info.outcome == TaskOperatorOutcome.ERROR + assert expected_view in run_info.message \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py new file mode 100644 index 00000000..3381d7f0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -0,0 +1,66 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + test_url_id: int, + test_agency_id: int +): + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add URL as data source but without record type or validated flag + ## App Link + ds_app_link = DSAppLinkDataSource( + url_id=test_url_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add validated URL flag + flag = FlagURLValidated( + url_id=test_url_id, + type=URLType.DATA_SOURCE + ) + await operator.adb_client.add(flag) + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__incomplete_data_sources_view" + ) + + # Add record type to data source + record_type = URLRecordType( + url_id=test_url_id, + record_type=RecordType.INCARCERATION_RECORDS + ) + await operator.adb_client.add(record_type) + + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add agency to data source + agency = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(agency) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py new file mode 100644 index 00000000..9c3a147d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -0,0 +1,54 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + test_agency_id: int, + test_url_id: int +): + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add Meta URL without linking an agency to it + ## Validated Flag + flag = FlagURLValidated( + url_id=test_url_id, + type=URLType.META_URL + ) + await operator.adb_client.add(flag) + + ## App Link + ds_app_link = DSAppLinkMetaURL( + url_id=test_url_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__incomplete_meta_urls_view" + ) + + # Add agency to Meta URL + link = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(link) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py new file mode 100644 index 00000000..ee189f64 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -0,0 +1,55 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + pittsburgh_locality: LocalityCreationInfo +): + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add federal agency + agency = Agency( + name="Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.FEDERAL + ) + await operator.adb_client.add(agency) + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add non-federal agency + agency = Agency( + name="Non-Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.LOCAL + ) + agency_id: int =await operator.adb_client.add(agency, return_id=True) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__non_federal_agencies_no_location_view" + ) + + # Add location to non-federal agency + link = LinkAgencyLocation( + agency_id=agency_id, + location_id=pittsburgh_locality.location_id + ) + await operator.adb_client.add(link) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..fa36a269 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -0,0 +1,54 @@ +import pytest +from sqlalchemy import delete + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + test_url_data_source_id: int +): + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add DS App Link + ds_app_link_ds = DSAppLinkDataSource( + url_id=test_url_data_source_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link_ds) + + # Add same URL as Meta URL + ## App Link + ds_app_link_mu = DSAppLinkMetaURL( + url_id=test_url_data_source_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link_mu) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__url_both_data_source_and_meta_url_view" + ) + + # Delete data source link + statement = ( + delete( + DSAppLinkMetaURL + ).where( + DSAppLinkMetaURL.url_id == test_url_data_source_id + ) + ) + await operator.adb_client.execute(statement) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index ae41bc30..4e5bb551 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 20 +NUMBER_OF_ENTRIES = 21 @pytest.mark.asyncio async def test_happy_path(