From a0d0e5ebc61dda4ae372f39cbd0ac1f8fb4693ad Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 24 Nov 2025 09:35:41 -0500 Subject: [PATCH 1/3] Begin draft --- ...1923-5ac9d50b91c5_add_integrity_monitor.py | 98 +++++++++++++++++++ .../scheduled/impl/integrity/__init__.py | 0 .../scheduled/impl/integrity/operator.py | 9 ++ src/db/enums.py | 1 + src/db/models/helpers.py | 3 + src/db/models/mixins.py | 9 +- src/db/models/views/dependent_locations.py | 3 +- src/db/models/views/integrity/__init__.py | 0 .../integrity/incomplete_data_sources.py | 35 +++++++ .../views/integrity/incomplete_meta_urls.py | 34 +++++++ .../non_federal_agencies_no_location.py | 25 +++++ .../url_both_data_source_and_meta_url.py | 21 ++++ src/db/models/views/meta_url.py | 9 +- src/db/models/views/unvalidated_url.py | 11 +-- src/db/models/views/url_anno_count.py | 9 +- src/db/models/views/url_annotations_flags.py | 9 +- src/db/models/views/url_status/core.py | 9 +- .../scheduled/impl/integrity/__init__.py | 0 .../scheduled/impl/integrity/conftest.py | 11 +++ .../integrity/test_incomplete_data_sources.py | 10 ++ .../integrity/test_incomplete_meta_urls.py | 10 ++ .../test_non_federal_agencies_no_location.py | 26 +++++ .../test_url_both_data_source_and_meta_url.py | 10 ++ 23 files changed, 314 insertions(+), 38 deletions(-) create mode 100644 alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py create mode 100644 src/core/tasks/scheduled/impl/integrity/__init__.py create mode 100644 src/core/tasks/scheduled/impl/integrity/operator.py create mode 100644 src/db/models/views/integrity/__init__.py create mode 100644 src/db/models/views/integrity/incomplete_data_sources.py create mode 100644 src/db/models/views/integrity/incomplete_meta_urls.py create mode 100644 src/db/models/views/integrity/non_federal_agencies_no_location.py create mode 100644 src/db/models/views/integrity/url_both_data_source_and_meta_url.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py diff --git a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py new file mode 100644 index 00000000..75f41186 --- /dev/null +++ b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py @@ -0,0 +1,98 @@ +"""Add integrity monitor + +Revision ID: 5ac9d50b91c5 +Revises: 1bb2dfad3275 +Create Date: 2025-11-23 19:23:45.487445 + +""" +from typing import Sequence, Union + +from alembic import op + +from src.util.alembic_helpers import add_enum_value + +# revision identifiers, used by Alembic. +revision: str = '5ac9d50b91c5' +down_revision: Union[str, None] = '1bb2dfad3275' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + + + +def upgrade() -> None: + _create_integrity_task() + _create_incomplete_data_sources_view() + _create_incomplete_meta_urls_view() + _create_url_both_data_source_and_meta_url_view() + _create_non_federal_agencies_no_location_view() + +def _create_non_federal_agencies_no_location_view(): + op.execute(""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.name + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """) + + +def _create_url_both_data_source_and_meta_url_view(): + op.execute(""" + create view integrity__url_both_data_source_and_meta_url_view + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id + """) + + +def _create_incomplete_meta_urls_view(): + op.execute(""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or urt.url_id is null + """) + + +def _create_incomplete_data_sources_view(): + op.execute(""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type + + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + """) + + +def _create_integrity_task(): + add_enum_value( + enum_name="task_type", + enum_value="Integrity Monitor", + ) + + +def downgrade() -> None: + pass diff --git a/src/core/tasks/scheduled/impl/integrity/__init__.py b/src/core/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/operator.py b/src/core/tasks/scheduled/impl/integrity/operator.py new file mode 100644 index 00000000..c9796c6d --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/operator.py @@ -0,0 +1,9 @@ +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase + + +class IntegrityMonitorTaskOperator( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin +): + pass \ No newline at end of file diff --git a/src/db/enums.py b/src/db/enums.py index 034ec0b8..65f446c5 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -63,6 +63,7 @@ class TaskType(PyEnum): TASK_CLEANUP = "Task Cleanup" REFRESH_MATERIALIZED_VIEWS = "Refresh Materialized Views" UPDATE_URL_STATUS = "Update URL Status" + INTEGRITY_MONITOR = "Integrity Monitor" # Sync Tasks SYNC_AGENCIES_ADD = "Sync Agencies Add" diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index 592973a6..e1c77978 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -46,9 +46,12 @@ def location_id_column() -> Column[int]: CURRENT_TIME_SERVER_DEFAULT = func.now() +VIEW_ARG = {"info": "view"} + def url_id_primary_key_constraint() -> PrimaryKeyConstraint: return PrimaryKeyConstraint('url_id') + def county_column(nullable: bool = False) -> Column[int]: return Column( Integer(), diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 417eae40..7a7d6460 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -3,7 +3,8 @@ from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event from src.db.models.exceptions import WriteToViewError -from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT +from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT, url_id_primary_key_constraint, \ + VIEW_ARG class URLDependentMixin: @@ -90,3 +91,9 @@ def __declare_last__(cls) -> None: @staticmethod def _block_write(mapper, connection, target): raise WriteToViewError(f"{type(target).__name__} is a read-only view.") + +class URLDependentViewMixin(URLDependentMixin, ViewMixin): + __table_args__ = ( + url_id_primary_key_constraint(), + VIEW_ARG + ) \ No newline at end of file diff --git a/src/db/models/views/dependent_locations.py b/src/db/models/views/dependent_locations.py index 95f3db98..425e25a6 100644 --- a/src/db/models/views/dependent_locations.py +++ b/src/db/models/views/dependent_locations.py @@ -31,6 +31,7 @@ """ from sqlalchemy import Column, Integer, ForeignKey +from src.db.models.helpers import VIEW_ARG from src.db.models.mixins import ViewMixin from src.db.models.templates_.base import Base @@ -39,7 +40,7 @@ class DependentLocationView(Base, ViewMixin): __tablename__ = "dependent_locations" __table_args__ = ( - {"info": "view"} + VIEW_ARG, ) parent_location_id = Column( diff --git a/src/db/models/views/integrity/__init__.py b/src/db/models/views/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/views/integrity/incomplete_data_sources.py b/src/db/models/views/integrity/incomplete_data_sources.py new file mode 100644 index 00000000..8444b2e6 --- /dev/null +++ b/src/db/models/views/integrity/incomplete_data_sources.py @@ -0,0 +1,35 @@ +""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type + + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteDataSource( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_data_sources_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + has_record_type = Column(Boolean) diff --git a/src/db/models/views/integrity/incomplete_meta_urls.py b/src/db/models/views/integrity/incomplete_meta_urls.py new file mode 100644 index 00000000..4c7ec01d --- /dev/null +++ b/src/db/models/views/integrity/incomplete_meta_urls.py @@ -0,0 +1,34 @@ +""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or urt.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_meta_urls_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + + diff --git a/src/db/models/views/integrity/non_federal_agencies_no_location.py b/src/db/models/views/integrity/non_federal_agencies_no_location.py new file mode 100644 index 00000000..e45882fe --- /dev/null +++ b/src/db/models/views/integrity/non_federal_agencies_no_location.py @@ -0,0 +1,25 @@ +""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.name + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """ +from sqlalchemy import String, Column + +from src.db.models.helpers import VIEW_ARG +from src.db.models.mixins import ViewMixin +from src.db.models.templates_.base import Base + +class IntegrityNonFederalAgenciesNoLocation( + Base, + ViewMixin +): + __tablename__ = "integrity__non_federal_agencies_no_location_view" + __table_args__ = ( + VIEW_ARG, + ) + + name = Column(String) diff --git a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..eac08d03 --- /dev/null +++ b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py @@ -0,0 +1,21 @@ +""" + create view integrity__url_both_data_source_and_meta_url_view + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id +""" + +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class IntegrityURLBothDataSourceAndMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__url_both_data_source_and_meta_url_view" + + diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py index 20437075..a2d64ca9 100644 --- a/src/db/models/views/meta_url.py +++ b/src/db/models/views/meta_url.py @@ -9,18 +9,13 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class MetaURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): __tablename__ = "meta_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py index bcfa9293..baf5f071 100644 --- a/src/db/models/views/unvalidated_url.py +++ b/src/db/models/views/unvalidated_url.py @@ -11,18 +11,13 @@ """ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class UnvalidatedURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): - __tablename__ = "unvalidated_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file + __tablename__ = "unvalidated_url_view" \ No newline at end of file diff --git a/src/db/models/views/url_anno_count.py b/src/db/models/views/url_anno_count.py index 232f0d21..2e910afb 100644 --- a/src/db/models/views/url_anno_count.py +++ b/src/db/models/views/url_anno_count.py @@ -98,21 +98,16 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Integer from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationCount( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_count_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) auto_agency_count = Column(Integer, nullable=False) auto_location_count = Column(Integer, nullable=False) diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py index 47250d1b..c133fbfc 100644 --- a/src/db/models/views/url_annotations_flags.py +++ b/src/db/models/views/url_annotations_flags.py @@ -24,20 +24,15 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Boolean -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationFlagsView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_flags" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) has_auto_record_type_suggestion = Column(Boolean, nullable=False) has_auto_relevant_suggestion = Column(Boolean, nullable=False) diff --git a/src/db/models/views/url_status/core.py b/src/db/models/views/url_status/core.py index 77a01139..be771fe5 100644 --- a/src/db/models/views/url_status/core.py +++ b/src/db/models/views/url_status/core.py @@ -59,19 +59,14 @@ from sqlalchemy import String, Column from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLStatusMatView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_status_mat_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) status = Column(String) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py b/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py new file mode 100644 index 00000000..a3d3cd22 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py @@ -0,0 +1,11 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> IntegrityMonitorTaskOperator: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py new file mode 100644 index 00000000..1553c409 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -0,0 +1,10 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py new file mode 100644 index 00000000..1553c409 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -0,0 +1,10 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py new file mode 100644 index 00000000..41ea653d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -0,0 +1,26 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass + + # Check does not meet prerequisites + + # Add federal agency + + # Check does not meet prerequisites + + # Add non-federal agency + + # Check meets prerequisites + + # Run task and confirm produces error + + # Add location to non-federal agency + + # Check no longer meets task prerequisites diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..1553c409 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -0,0 +1,10 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass From 5ed52f37193c872fcf1d8a960c398acec386164f Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 24 Nov 2025 12:00:23 -0500 Subject: [PATCH 2/3] Continue draft --- .../impl/flag/url_validated/sqlalchemy.py | 3 +- .../tasks/scheduled/impl/integrity/helpers.py | 12 +++++ .../integrity/test_incomplete_data_sources.py | 54 +++++++++++++++++++ .../integrity/test_incomplete_meta_urls.py | 22 ++++++++ .../test_non_federal_agencies_no_location.py | 9 ++++ .../test_url_both_data_source_and_meta_url.py | 35 ++++++++++++ 6 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py index 97abf056..081441d8 100644 --- a/src/db/models/impl/flag/url_validated/sqlalchemy.py +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -1,4 +1,5 @@ from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy.orm import Mapped from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType @@ -19,7 +20,7 @@ class FlagURLValidated( ), ) - type = enum_column( + type: Mapped[URLType] = enum_column( enum_type=URLType, name="url_type", ) diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py new file mode 100644 index 00000000..60177c3f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py @@ -0,0 +1,12 @@ +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.core.tasks.url.enums import TaskOperatorOutcome + + +async def run_task_and_confirm_error( + operator: IntegrityMonitorTaskOperator, + expected_error: str +) -> None: + run_info: TaskOperatorRunInfo = await operator.run_task() + assert run_info.outcome == TaskOperatorOutcome.ERROR + assert run_info.error_message == expected_error \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py index 1553c409..d716a7da 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -1,6 +1,14 @@ import pytest +from src.core.enums import RecordType from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -8,3 +16,49 @@ async def test_core( operator: IntegrityMonitorTaskOperator ): pass + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add URL as data source but without record type or validated flag + ## URL + url = URL( + url="example.com", + source=URLSource.COLLECTOR, + trailing_slash=False + ) + url_id: int = await operator.adb_client.add(url, return_id=True) + + ## App Link + ds_app_link = DSAppLinkDataSource( + url_id=url_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add validated URL flag + flag = FlagURLValidated( + url_id=url_id, + type=URLType.DATA_SOURCE + ) + await operator.adb_client.add(flag) + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) + + # Add record type to data source + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.INCARCERATION_RECORDS + ) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py index 1553c409..def7cf9f 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -1,6 +1,7 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -8,3 +9,24 @@ async def test_core( operator: IntegrityMonitorTaskOperator ): pass + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add Meta URL without linking an agency to it + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) + + # Add agency to Meta URL + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py index 41ea653d..e77c0b31 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -1,6 +1,7 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -10,17 +11,25 @@ async def test_core( pass # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() # Add federal agency # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() # Add non-federal agency # Check meets prerequisites + assert await operator.meets_task_prerequisites() # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) # Add location to non-federal agency # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py index 1553c409..cce6269e 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -1,6 +1,9 @@ import pytest +from sqlalchemy import delete from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -8,3 +11,35 @@ async def test_core( operator: IntegrityMonitorTaskOperator ): pass + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add URL as data source + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add same URL as Meta URL + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) + + # Delete data source link + statement = ( + delete( + DSAppLinkDataSource + ).where( + DSAppLinkDataSource.url_id == url_id + ) + ) + await operator.adb_client.execute(statement) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() From 5ffda4768b66df9a84247c716c52f11cfc421813 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 24 Nov 2025 17:46:02 -0500 Subject: [PATCH 3/3] Finish integrity monitor draft --- ENV.md | 47 +++++++------- ...1923-5ac9d50b91c5_add_integrity_monitor.py | 35 ++++++----- .../scheduled/impl/integrity/exceptions.py | 4 ++ .../scheduled/impl/integrity/operator.py | 23 ++++++- .../impl/integrity/queries/__init__.py | 0 .../scheduled/impl/integrity/queries/cte.py | 61 +++++++++++++++++++ .../scheduled/impl/integrity/queries/get.py | 20 ++++++ .../impl/integrity/queries/prereq.py | 16 +++++ src/core/tasks/scheduled/loader.py | 8 +++ .../integrity/incomplete_data_sources.py | 15 +++-- .../views/integrity/incomplete_meta_urls.py | 14 +++-- .../non_federal_agencies_no_location.py | 10 +-- .../url_both_data_source_and_meta_url.py | 2 +- tests/automated/integration/conftest.py | 15 +++++ .../scheduled/impl/integrity/conftest.py | 4 +- .../tasks/scheduled/impl/integrity/helpers.py | 4 +- .../integrity/test_incomplete_data_sources.py | 36 +++++------ .../integrity/test_incomplete_meta_urls.py | 30 +++++++-- .../test_non_federal_agencies_no_location.py | 28 +++++++-- .../test_url_both_data_source_and_meta_url.py | 27 +++++--- .../tasks/scheduled/loader/test_happy_path.py | 2 +- 21 files changed, 304 insertions(+), 97 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/integrity/exceptions.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/prereq.py diff --git a/ENV.md b/ENV.md index a4ae17a7..386dbdae 100644 --- a/ENV.md +++ b/ENV.md @@ -57,29 +57,30 @@ Note that some tasks/subtasks are themselves enabled by other tasks. ### Scheduled Task Flags -| Flag | Description | -|----------------------------------------|-------------------------------------------------------------------------------| -| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | -| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | -| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | -| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | -| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | -| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | -| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | -| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | -| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | -| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | -| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | -| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | -| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| -| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| -| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| -| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| -| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| -| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| +| Flag | Description | +|--------------------------------------------|-------------------------------------------------------------------------------| +| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | +| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | +| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | +| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | +| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | +| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | +| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | +| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | +| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | +| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | +| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | +| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| +| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| +| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| +| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| +| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| +| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| +| `INTEGRITY_MONITOR_TASK_FLAG` | Runs integrity checks. | ### URL Task Flags diff --git a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py index 75f41186..1f44dd25 100644 --- a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py +++ b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py @@ -17,9 +17,6 @@ branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None - - - def upgrade() -> None: _create_integrity_task() _create_incomplete_data_sources_view() @@ -31,7 +28,7 @@ def _create_non_federal_agencies_no_location_view(): op.execute(""" create view integrity__non_federal_agencies_no_location_view as select - ag.name + ag.id as agency_id from agencies ag left join link_agencies__locations link on ag.id = link.agency_id where ag.jurisdiction_type != 'federal' @@ -41,7 +38,7 @@ def _create_non_federal_agencies_no_location_view(): def _create_url_both_data_source_and_meta_url_view(): op.execute(""" - create view integrity__url_both_data_source_and_meta_url_view + create view integrity__url_both_data_source_and_meta_url_view as select ds.url_id from @@ -53,18 +50,19 @@ def _create_url_both_data_source_and_meta_url_view(): def _create_incomplete_meta_urls_view(): op.execute(""" - create view integrity__incomplete_data_sources_view as + create view integrity__incomplete_meta_urls_view as select mu.url_id, fuv.url_id is not null as has_validated_flag, - fuv.type as validated_type + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id where fuv.url_id is null - or fuv.type != 'meta url' - or urt.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null """) @@ -72,18 +70,20 @@ def _create_incomplete_data_sources_view(): op.execute(""" create view integrity__incomplete_data_sources_view as select - mu.url_id, + ds.url_id, fuv.url_id is not null as has_validated_flag, fuv.type as validated_type, - urt.url_id is not null as has_record_type - - from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id where fuv.url_id is null or fuv.type != 'data source' or urt.url_id is null + or lau.url_id is null """) @@ -93,6 +93,5 @@ def _create_integrity_task(): enum_value="Integrity Monitor", ) - def downgrade() -> None: pass diff --git a/src/core/tasks/scheduled/impl/integrity/exceptions.py b/src/core/tasks/scheduled/impl/integrity/exceptions.py new file mode 100644 index 00000000..3e9f797e --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/exceptions.py @@ -0,0 +1,4 @@ + + +class IntegrityMonitorTaskException(Exception): + pass \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/operator.py b/src/core/tasks/scheduled/impl/integrity/operator.py index c9796c6d..42ca43bb 100644 --- a/src/core/tasks/scheduled/impl/integrity/operator.py +++ b/src/core/tasks/scheduled/impl/integrity/operator.py @@ -1,9 +1,30 @@ from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.impl.integrity.exceptions import IntegrityMonitorTaskException +from src.core.tasks.scheduled.impl.integrity.queries.get import GetIntegrityTaskDataQueryBuilder +from src.core.tasks.scheduled.impl.integrity.queries.prereq import GetIntegrityTaskPrerequisitesQueryBuilder from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.enums import TaskType class IntegrityMonitorTaskOperator( ScheduledTaskOperatorBase, HasPrerequisitesMixin ): - pass \ No newline at end of file + + @property + def task_type(self) -> TaskType: + return TaskType.INTEGRITY_MONITOR + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + query_builder=GetIntegrityTaskPrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + failing_views: list[str] = await self.run_query_builder( + query_builder=GetIntegrityTaskDataQueryBuilder() + ) + raise IntegrityMonitorTaskException( + f"Integrity Monitor Task failed for the following views {failing_views}", + ) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/__init__.py b/src/core/tasks/scheduled/impl/integrity/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/queries/cte.py b/src/core/tasks/scheduled/impl/integrity/queries/cte.py new file mode 100644 index 00000000..dc894ea7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/cte.py @@ -0,0 +1,61 @@ +from sqlalchemy import select, literal, Exists, Label, or_ + +from src.db.models.templates_.base import Base +from src.db.models.views.integrity.incomplete_data_sources import IntegrityIncompleteDataSource +from src.db.models.views.integrity.incomplete_meta_urls import IntegrityIncompleteMetaURL +from src.db.models.views.integrity.non_federal_agencies_no_location import IntegrityNonFederalAgenciesNoLocation +from src.db.models.views.integrity.url_both_data_source_and_meta_url import IntegrityURLBothDataSourceAndMetaURL + + +def any_row_exists( + model: type[Base] +) -> Exists: + return ( + select( + literal(1) + ) + .select_from( + model + ) + .exists() + ) + +class IntegrityTaskCTEContainer: + + def __init__( + self, + ): + self.models: list[type[Base]] = [ + IntegrityURLBothDataSourceAndMetaURL, + IntegrityNonFederalAgenciesNoLocation, + IntegrityIncompleteMetaURL, + IntegrityIncompleteDataSource, + ] + + expressions: list[Label[bool]] = [ + any_row_exists(model) + .label(model.__tablename__) + for model in self.models + ] + + self.cte = ( + select( + *expressions + ) + .cte( + name="integrity_task_cte", + ) + ) + + @property + def any_rows_exist_query(self) -> select: + expression = [ + getattr(self.cte.c, model.__tablename__) + for model in self.models + ] + return select(or_(*expression)) + + @property + def select_all_columns_query(self) -> select: + return select(self.cte) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/get.py b/src/core/tasks/scheduled/impl/integrity/queries/get.py new file mode 100644 index 00000000..b8632fa2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/get.py @@ -0,0 +1,20 @@ +from sqlalchemy import RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskDataQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[str]: + cte = IntegrityTaskCTEContainer() + mapping: RowMapping = await self.sh.mapping( + session=session, + query=cte.select_all_columns_query + ) + return [ + model.__tablename__ + for model in cte.models + if mapping[model.__tablename__] + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/queries/prereq.py b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py new file mode 100644 index 00000000..12a6fa33 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py @@ -0,0 +1,16 @@ +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> Any: + cte = IntegrityTaskCTEContainer() + return await self.sh.scalar( + session=session, + query=cte.any_rows_exist_query + ) diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 394a60ce..116bf56d 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -6,6 +6,7 @@ from src.core.tasks.scheduled.impl.delete_logs.operator import DeleteOldLogsTaskOperator from src.core.tasks.scheduled.impl.delete_stale_screenshots.operator import DeleteStaleScreenshotsTaskOperator from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.mark_never_completed.operator import MarkTaskNeverCompletedOperator @@ -127,6 +128,13 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.DAILY.value, enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") ), + ScheduledTaskEntry( + operator=IntegrityMonitorTaskOperator( + adb_client=self.adb_client + ), + interval_minutes=IntervalEnum.DAILY.value, + enabled=self.setup_flag("INTEGRITY_MONITOR_TASK_FLAG") + ), # Sync ## Adds ### Agency diff --git a/src/db/models/views/integrity/incomplete_data_sources.py b/src/db/models/views/integrity/incomplete_data_sources.py index 8444b2e6..06efa3b4 100644 --- a/src/db/models/views/integrity/incomplete_data_sources.py +++ b/src/db/models/views/integrity/incomplete_data_sources.py @@ -1,18 +1,20 @@ """ create view integrity__incomplete_data_sources_view as select - mu.url_id, + ds.url_id, fuv.url_id is not null as has_validated_flag, fuv.type as validated_type, - urt.url_id is not null as has_record_type - - from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id where fuv.url_id is null or fuv.type != 'data source' or urt.url_id is null + or lau.url_id is null """ from sqlalchemy import Column, Boolean @@ -33,3 +35,4 @@ class IntegrityIncompleteDataSource( name="url_type", ) has_record_type = Column(Boolean) + has_agency_flag = Column(Boolean) diff --git a/src/db/models/views/integrity/incomplete_meta_urls.py b/src/db/models/views/integrity/incomplete_meta_urls.py index 4c7ec01d..a837c156 100644 --- a/src/db/models/views/integrity/incomplete_meta_urls.py +++ b/src/db/models/views/integrity/incomplete_meta_urls.py @@ -1,16 +1,17 @@ """ - create view integrity__incomplete_data_sources_view as + create view integrity__incomplete_meta_urls_view as select mu.url_id, fuv.url_id is not null as has_validated_flag, - fuv.type as validated_type + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id where fuv.url_id is null - or fuv.type != 'meta url' - or urt.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null """ from sqlalchemy import Column, Boolean @@ -30,5 +31,6 @@ class IntegrityIncompleteMetaURL( enum_type=URLType, name="url_type", ) + has_agency_flag = Column(Boolean) diff --git a/src/db/models/views/integrity/non_federal_agencies_no_location.py b/src/db/models/views/integrity/non_federal_agencies_no_location.py index e45882fe..73e547b9 100644 --- a/src/db/models/views/integrity/non_federal_agencies_no_location.py +++ b/src/db/models/views/integrity/non_federal_agencies_no_location.py @@ -1,24 +1,26 @@ """ create view integrity__non_federal_agencies_no_location_view as select - ag.name + ag.id as agency_id from agencies ag left join link_agencies__locations link on ag.id = link.agency_id where ag.jurisdiction_type != 'federal' and link.location_id is null """ -from sqlalchemy import String, Column +from sqlalchemy import String, Column, PrimaryKeyConstraint from src.db.models.helpers import VIEW_ARG -from src.db.models.mixins import ViewMixin +from src.db.models.mixins import ViewMixin, AgencyDependentMixin from src.db.models.templates_.base import Base class IntegrityNonFederalAgenciesNoLocation( Base, - ViewMixin + ViewMixin, + AgencyDependentMixin, ): __tablename__ = "integrity__non_federal_agencies_no_location_view" __table_args__ = ( + PrimaryKeyConstraint("agency_id"), VIEW_ARG, ) diff --git a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py index eac08d03..0de88314 100644 --- a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py +++ b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py @@ -1,5 +1,5 @@ """ - create view integrity__url_both_data_source_and_meta_url_view + create view integrity__url_both_data_source_and_meta_url_view as select ds.url_id from diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 6e2be0f0..4c6a76d0 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -6,6 +6,7 @@ from starlette.testclient import TestClient from src.api.main import app +from src.collectors.enums import URLStatus from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore from src.core.enums import RecordType @@ -14,6 +15,8 @@ from src.db.client.sync import DatabaseClient from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.security.dtos.access_info import AccessInfo from src.security.enums import Permissions from src.security.manager import get_access_info @@ -218,6 +221,18 @@ async def test_url_data_source_id( ) return url_id +@pytest_asyncio.fixture +async def test_url_id( + db_data_creator: DBDataCreator, +) -> int: + url = URL( + url="example.com", + source=URLSource.COLLECTOR, + trailing_slash=False, + status=URLStatus.OK + ) + return await db_data_creator.adb_client.add(url, return_id=True) + @pytest_asyncio.fixture async def test_url_data_source_mapping( db_data_creator: DBDataCreator, diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py index a3d3cd22..9106f1b7 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py @@ -8,4 +8,6 @@ def operator( adb_client_test: AsyncDatabaseClient ) -> IntegrityMonitorTaskOperator: - raise NotImplementedError \ No newline at end of file + return IntegrityMonitorTaskOperator( + adb_client=adb_client_test, + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py index 60177c3f..2b617ca2 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py @@ -5,8 +5,8 @@ async def run_task_and_confirm_error( operator: IntegrityMonitorTaskOperator, - expected_error: str + expected_view: str ) -> None: run_info: TaskOperatorRunInfo = await operator.run_task() assert run_info.outcome == TaskOperatorOutcome.ERROR - assert run_info.error_message == expected_error \ No newline at end of file + assert expected_view in run_info.message \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py index d716a7da..3381d7f0 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -4,8 +4,7 @@ from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @@ -13,25 +12,17 @@ @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + test_url_id: int, + test_agency_id: int ): - pass - # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add URL as data source but without record type or validated flag - ## URL - url = URL( - url="example.com", - source=URLSource.COLLECTOR, - trailing_slash=False - ) - url_id: int = await operator.adb_client.add(url, return_id=True) - ## App Link ds_app_link = DSAppLinkDataSource( - url_id=url_id, + url_id=test_url_id, ds_data_source_id=1 ) await operator.adb_client.add(ds_app_link) @@ -41,7 +32,7 @@ async def test_core( # Add validated URL flag flag = FlagURLValidated( - url_id=url_id, + url_id=test_url_id, type=URLType.DATA_SOURCE ) await operator.adb_client.add(flag) @@ -51,14 +42,25 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__incomplete_data_sources_view" ) # Add record type to data source record_type = URLRecordType( - url_id=url_id, + url_id=test_url_id, record_type=RecordType.INCARCERATION_RECORDS ) + await operator.adb_client.add(record_type) + + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add agency to data source + agency = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(agency) # Check no longer meets task prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py index def7cf9f..9c3a147d 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -1,19 +1,36 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + test_agency_id: int, + test_url_id: int ): - pass - # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add Meta URL without linking an agency to it + ## Validated Flag + flag = FlagURLValidated( + url_id=test_url_id, + type=URLType.META_URL + ) + await operator.adb_client.add(flag) + + ## App Link + ds_app_link = DSAppLinkMetaURL( + url_id=test_url_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link) # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -21,10 +38,15 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__incomplete_meta_urls_view" ) # Add agency to Meta URL + link = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(link) # Check no longer meets task prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py index e77c0b31..ee189f64 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -1,24 +1,39 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + pittsburgh_locality: LocalityCreationInfo ): - pass - # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add federal agency + agency = Agency( + name="Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.FEDERAL + ) + await operator.adb_client.add(agency) # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add non-federal agency + agency = Agency( + name="Non-Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.LOCAL + ) + agency_id: int =await operator.adb_client.add(agency, return_id=True) # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -26,10 +41,15 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__non_federal_agencies_no_location_view" ) # Add location to non-federal agency + link = LinkAgencyLocation( + agency_id=agency_id, + location_id=pittsburgh_locality.location_id + ) + await operator.adb_client.add(link) # Check no longer meets task prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py index cce6269e..fa36a269 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -3,24 +3,33 @@ from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + test_url_data_source_id: int ): - pass # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() - # Add URL as data source - - # Check does not meet prerequisites - assert not await operator.meets_task_prerequisites() + # Add DS App Link + ds_app_link_ds = DSAppLinkDataSource( + url_id=test_url_data_source_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link_ds) # Add same URL as Meta URL + ## App Link + ds_app_link_mu = DSAppLinkMetaURL( + url_id=test_url_data_source_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link_mu) # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -28,15 +37,15 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__url_both_data_source_and_meta_url_view" ) # Delete data source link statement = ( delete( - DSAppLinkDataSource + DSAppLinkMetaURL ).where( - DSAppLinkDataSource.url_id == url_id + DSAppLinkMetaURL.url_id == test_url_data_source_id ) ) await operator.adb_client.execute(statement) diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index ae41bc30..4e5bb551 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 20 +NUMBER_OF_ENTRIES = 21 @pytest.mark.asyncio async def test_happy_path(