Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@ Note that some tasks/subtasks are themselves enabled by other tasks.
| Flag | Description |
|-------------------------------------|--------------------------------------------------------------------|
| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. |
| `SYNC_AGENCIES_TASK_FLAG` | Synchronize agencies from Data Sources App. |
| `SYNC_DATA_SOURCES_TASK_FLAG` | Synchronize data sources from Data Sources App. |
| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. |
| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. |
| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. |
Expand All @@ -86,6 +84,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag
| `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. |
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
| `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. |
| `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. |

### Agency ID Subtasks

Expand Down
127 changes: 127 additions & 0 deletions alembic/versions/2025_09_22_1916-e6a1a1b3bad4_add_url_record_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Add URL record type

Revision ID: e6a1a1b3bad4
Revises: 6b3db0c19f9b
Create Date: 2025-09-22 19:16:01.744304

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from src.util.alembic_helpers import url_id_column, created_at_column, id_column

# revision identifiers, used by Alembic.
revision: str = 'e6a1a1b3bad4'
down_revision: Union[str, None] = '6b3db0c19f9b'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

URL_RECORD_TYPE_TABLE_NAME = "url_record_type"




def upgrade() -> None:
    """Apply the migration.

    Moves per-URL record types out of the `urls.record_type` column into a
    dedicated `url_record_type` table, then removes the now-unused
    agency/data-source sync-state tables.

    Order matters: the new table must exist before data is migrated into it,
    and the old column is only dropped after the copy succeeds.
    """
    _create_url_record_type_table()
    _migrate_url_record_types_to_url_record_type_table()
    _drop_record_type_column()
    _drop_agencies_sync_state()
    _drop_data_sources_sync_state()

def _drop_agencies_sync_state():
    """Remove the obsolete `agencies_sync_state` table."""
    table_name = "agencies_sync_state"
    op.drop_table(table_name)


def _drop_data_sources_sync_state():
    """Remove the obsolete `data_sources_sync_state` table."""
    table_name = "data_sources_sync_state"
    op.drop_table(table_name)


def _create_data_sources_sync_state():
    """Recreate the `data_sources_sync_state` table (downgrade path).

    The table holds a single row of sync bookkeeping; an all-NULL row is
    seeded so sync code can update it in place rather than insert.
    """
    sync_table = op.create_table(
        "data_sources_sync_state",
        id_column(),
        sa.Column("last_full_sync_at", sa.DateTime(), nullable=True),
        sa.Column("current_cutoff_date", sa.Date(), nullable=True),
        sa.Column("current_page", sa.Integer(), nullable=True),
    )
    empty_row = {
        "last_full_sync_at": None,
        "current_cutoff_date": None,
        "current_page": None,
    }
    op.bulk_insert(sync_table, [empty_row])


def _create_agencies_sync_state():
    """Recreate the `agencies_sync_state` table (downgrade path).

    Mirrors `_create_data_sources_sync_state`: one all-NULL bookkeeping row
    is seeded so sync code can update it in place.
    """
    sync_table = op.create_table(
        "agencies_sync_state",
        id_column(),
        sa.Column("last_full_sync_at", sa.DateTime(), nullable=True),
        sa.Column("current_cutoff_date", sa.Date(), nullable=True),
        sa.Column("current_page", sa.Integer(), nullable=True),
    )
    empty_row = {
        "last_full_sync_at": None,
        "current_cutoff_date": None,
        "current_page": None,
    }
    op.bulk_insert(sync_table, [empty_row])


def downgrade() -> None:
    """Revert the migration.

    Restores the `record_type` column on `urls`, copies record types back
    from `url_record_type`, drops that table, and recreates the sync-state
    tables removed by `upgrade`.

    Order matters: the column must exist before data is copied back into it,
    and `url_record_type` is only dropped after the copy succeeds.
    """
    _add_record_type_column()
    _migrate_url_record_types_from_url_record_type_table()
    _drop_url_record_type_table()
    _create_agencies_sync_state()
    _create_data_sources_sync_state()

def _drop_record_type_column():
    """Remove the legacy `record_type` column from `urls`."""
    column_name = "record_type"
    op.drop_column("urls", column_name)

def _add_record_type_column():
    """Restore the nullable `record_type` enum column on `urls` (downgrade path)."""
    # Reference the existing PostgreSQL enum type; do not recreate it.
    record_type_enum = postgresql.ENUM(name="record_type", create_type=False)
    op.add_column(
        "urls",
        sa.Column("record_type", record_type_enum, nullable=True),
    )


def _create_url_record_type_table():
    """Create the `url_record_type` table keyed one-to-one on URL id."""
    # Reference the existing PostgreSQL enum type; do not recreate it.
    record_type_enum = postgresql.ENUM(name="record_type", create_type=False)
    op.create_table(
        URL_RECORD_TYPE_TABLE_NAME,
        url_id_column(primary_key=True),
        sa.Column("record_type", record_type_enum, nullable=False),
        created_at_column(),
    )


def _drop_url_record_type_table():
    """Remove the `url_record_type` table (downgrade path)."""
    op.drop_table(URL_RECORD_TYPE_TABLE_NAME)


def _migrate_url_record_types_from_url_record_type_table():
    """Copy record types from `url_record_type` back onto `urls.record_type` (downgrade path)."""
    # Join on url_id; URLs without a url_record_type row keep a NULL record_type.
    op.execute("""
    UPDATE urls
    SET record_type = url_record_type.record_type
    FROM url_record_type
    WHERE urls.id = url_record_type.url_id
    """)


def _migrate_url_record_types_to_url_record_type_table():
    """Copy non-NULL `urls.record_type` values into the new `url_record_type` table."""
    # NULLs are skipped: url_record_type.record_type is NOT NULL, so only
    # URLs that actually have a record type get a row.
    op.execute("""
    INSERT INTO url_record_type (url_id, record_type)
    SELECT id, record_type
    FROM urls
    WHERE record_type IS NOT NULL
    """)
35 changes: 24 additions & 11 deletions src/api/endpoints/annotate/all/post/models/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
location_ids: list[int]

@model_validator(mode="after")
def forbid_record_type_if_meta_url(self):
if self.suggested_status == URLType.META_URL and self.record_type is not None:
def forbid_record_type_if_meta_url_or_individual_record(self):

Check warning on line 15 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L15 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:15:1: D102 Missing docstring in public method
if self.suggested_status not in [
URLType.META_URL,
URLType.INDIVIDUAL_RECORD,
]:
return self
if self.record_type is not None:
raise FailedValidationException("record_type must be None if suggested_status is META_URL")
return self

Expand All @@ -24,31 +29,39 @@
return self

@model_validator(mode="after")
def require_location_if_meta_url_or_data_source(self):
if self.suggested_status not in [URLType.META_URL, URLType.DATA_SOURCE]:
def require_location_if_relevant(self):

Check warning on line 32 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L32 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:32:1: D102 Missing docstring in public method
if self.suggested_status not in [
URLType.META_URL,
URLType.DATA_SOURCE,
URLType.INDIVIDUAL_RECORD,
]:
return self
if len(self.location_ids) == 0:
raise FailedValidationException("location_ids must be provided if suggested_status is META_URL or DATA_SOURCE")
return self

@model_validator(mode="after")
def require_agency_id_if_meta_url_or_data_source(self):
if self.suggested_status not in [URLType.META_URL, URLType.DATA_SOURCE]:
def require_agency_id_if_relevant(self):

Check warning on line 44 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L44 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:44:1: D102 Missing docstring in public method
if self.suggested_status not in [
URLType.META_URL,
URLType.DATA_SOURCE,
URLType.INDIVIDUAL_RECORD,
]:
return self
if len(self.agency_ids) == 0:
raise FailedValidationException("agencies must be provided if suggested_status is META_URL or DATA_SOURCE")
return self

@model_validator(mode="after")
def forbid_all_else_if_not_meta_url_or_data_source(self):
if self.suggested_status in [URLType.META_URL, URLType.DATA_SOURCE]:
def forbid_all_else_if_not_relevant(self):

Check warning on line 56 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L56 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:56:1: D102 Missing docstring in public method
if self.suggested_status != URLType.NOT_RELEVANT:
return self
if self.record_type is not None:
raise FailedValidationException("record_type must be None if suggested_status is not META_URL or DATA_SOURCE")
raise FailedValidationException("record_type must be None if suggested_status is NOT RELEVANT")
if len(self.agency_ids) > 0:
raise FailedValidationException("agency_ids must be empty if suggested_status is not META_URL or DATA_SOURCe")
raise FailedValidationException("agency_ids must be empty if suggested_status is NOT RELEVANT")
if len(self.location_ids) > 0:
raise FailedValidationException("location_ids must be empty if suggested_status is not META_URL or DATA_SOURCE")
raise FailedValidationException("location_ids must be empty if suggested_status is NOT RELEVANT")
return self


Expand Down
5 changes: 1 addition & 4 deletions src/api/endpoints/annotate/all/post/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ async def run(self, session: AsyncSession) -> None:
session.add(relevant_suggestion)

# If not relevant, do nothing else
if not self.post_info.suggested_status in [
URLType.META_URL,
URLType.DATA_SOURCE
]:
if self.post_info.suggested_status == URLType.NOT_RELEVANT:
return

locations: list[UserLocationSuggestion] = []
Expand Down
18 changes: 14 additions & 4 deletions src/api/endpoints/collector/manual/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from src.db.models.impl.url.core.enums import URLSource
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.queries.base.builder import QueryBuilderBase


Expand Down Expand Up @@ -37,9 +38,9 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
session.add(batch)
await session.flush()

batch_id = batch.id
url_ids = []
duplicate_urls = []
batch_id: int = batch.id
url_ids: list[int] = []
duplicate_urls: list[str] = []

for entry in self.dto.entries:
url = URL(
Expand All @@ -48,10 +49,10 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
description=entry.description,
collector_metadata=entry.collector_metadata,
status=URLStatus.OK.value,
record_type=entry.record_type.value if entry.record_type is not None else None,
source=URLSource.MANUAL
)


async with session.begin_nested():
try:
session.add(url)
Expand All @@ -60,6 +61,15 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
duplicate_urls.append(entry.url)
continue
await session.flush()

if entry.record_type is not None:
record_type = URLRecordType(
url_id=url.id,
record_type=entry.record_type,
)
session.add(record_type)


link = LinkBatchURL(
batch_id=batch_id,
url_id=url.id
Expand Down
18 changes: 10 additions & 8 deletions src/api/endpoints/review/approve/query_/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.impl.url.reviewing_user import ReviewingUserURL
from src.db.queries.base.builder import QueryBuilderBase

Expand All @@ -34,7 +35,7 @@ async def run(self, session: AsyncSession) -> None:

url = await self._get_url(session)

await self._optionally_update_record_type(url)
await self._optionally_update_record_type(session)

# Get existing agency ids
existing_agencies = url.confirmed_agencies or []
Expand Down Expand Up @@ -88,14 +89,15 @@ async def _optionally_update_optional_metdata(self, url: URL) -> None:
self.approval_info.supplying_entity
)

async def _optionally_update_record_type(self, url: URL) -> None:
update_if_not_none(
url,
"record_type",
self.approval_info.record_type.value
if self.approval_info.record_type is not None else None,
required=True
async def _optionally_update_record_type(self, session: AsyncSession) -> None:
if self.approval_info.record_type is None:
return

record_type = URLRecordType(
url_id=self.approval_info.url_id,
record_type=self.approval_info.record_type.value
)
session.add(record_type)

async def _get_url(self, session: AsyncSession) -> URL:
query = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.queries.base.builder import QueryBuilderBase
from src.db.utils.compression import decompress_html

Expand All @@ -33,10 +34,14 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut
select(
URL.id.label(label_url_id),
URL.url.label(label_url),
URL.record_type.label(label_record_type_fine),
URLRecordType.record_type.label(label_record_type_fine),
URLCompressedHTML.compressed_html.label(label_html),
FlagURLValidated.type.label(label_type)
)
.join(
URLRecordType,
URL.id == URLRecordType.url_id
)
.join(
URLCompressedHTML,
URL.id == URLCompressedHTML.url_id
Expand Down

This file was deleted.

Loading