Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@ Note that some tasks/subtasks are themselves enabled by other tasks.
| Flag | Description |
|-------------------------------------|--------------------------------------------------------------------|
| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. |
| `SYNC_AGENCIES_TASK_FLAG` | Synchronize agencies from Data Sources App. |
| `SYNC_DATA_SOURCES_TASK_FLAG` | Synchronize data sources from Data Sources App. |
| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. |
| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. |
| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. |
Expand All @@ -86,6 +84,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag
| `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. |
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
| `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. |
| `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. |

### Agency ID Subtasks

Expand Down
127 changes: 127 additions & 0 deletions alembic/versions/2025_09_22_1916-e6a1a1b3bad4_add_url_record_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Add URL record type

Revision ID: e6a1a1b3bad4
Revises: 6b3db0c19f9b
Create Date: 2025-09-22 19:16:01.744304

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from src.util.alembic_helpers import url_id_column, created_at_column, id_column

# revision identifiers, used by Alembic.
revision: str = 'e6a1a1b3bad4'
down_revision: Union[str, None] = '6b3db0c19f9b'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

URL_RECORD_TYPE_TABLE_NAME = "url_record_type"




def upgrade() -> None:
    """Apply the migration.

    Moves per-URL record types out of the `urls.record_type` column into a
    dedicated `url_record_type` table, then removes the now-unused
    agency/data-source sync-state tables.

    Order matters: the new table must exist before data is migrated into it,
    and the old column is only dropped after the copy succeeds.
    """
    _create_url_record_type_table()
    _migrate_url_record_types_to_url_record_type_table()
    _drop_record_type_column()
    _drop_agencies_sync_state()
    _drop_data_sources_sync_state()

def _drop_agencies_sync_state():
    """Remove the obsolete `agencies_sync_state` table."""
    table_name = "agencies_sync_state"
    op.drop_table(table_name)


def _drop_data_sources_sync_state():
    """Remove the obsolete `data_sources_sync_state` table."""
    table_name = "data_sources_sync_state"
    op.drop_table(table_name)


def _create_data_sources_sync_state():
    """Recreate the `data_sources_sync_state` table (downgrade path).

    The table holds a single row of sync bookkeeping; an all-NULL row is
    seeded so sync code can update it in place rather than insert.
    """
    sync_table = op.create_table(
        "data_sources_sync_state",
        id_column(),
        sa.Column("last_full_sync_at", sa.DateTime(), nullable=True),
        sa.Column("current_cutoff_date", sa.Date(), nullable=True),
        sa.Column("current_page", sa.Integer(), nullable=True),
    )
    empty_row = {
        "last_full_sync_at": None,
        "current_cutoff_date": None,
        "current_page": None,
    }
    op.bulk_insert(sync_table, [empty_row])


def _create_agencies_sync_state():
    """Recreate the `agencies_sync_state` table (downgrade path).

    Mirrors `_create_data_sources_sync_state`: one all-NULL bookkeeping row
    is seeded so sync code can update it in place.
    """
    sync_table = op.create_table(
        "agencies_sync_state",
        id_column(),
        sa.Column("last_full_sync_at", sa.DateTime(), nullable=True),
        sa.Column("current_cutoff_date", sa.Date(), nullable=True),
        sa.Column("current_page", sa.Integer(), nullable=True),
    )
    empty_row = {
        "last_full_sync_at": None,
        "current_cutoff_date": None,
        "current_page": None,
    }
    op.bulk_insert(sync_table, [empty_row])


def downgrade() -> None:
    """Revert the migration.

    Restores the `record_type` column on `urls`, copies record types back
    from `url_record_type`, drops that table, and recreates the sync-state
    tables removed by `upgrade`.

    Order matters: the column must exist before data is copied back into it,
    and `url_record_type` is only dropped after the copy succeeds.
    """
    _add_record_type_column()
    _migrate_url_record_types_from_url_record_type_table()
    _drop_url_record_type_table()
    _create_agencies_sync_state()
    _create_data_sources_sync_state()

def _drop_record_type_column():
    """Remove the legacy `record_type` column from `urls`."""
    column_name = "record_type"
    op.drop_column("urls", column_name)

def _add_record_type_column():
    """Restore the nullable `record_type` enum column on `urls` (downgrade path)."""
    # Reference the existing PostgreSQL enum type; do not recreate it.
    record_type_enum = postgresql.ENUM(name="record_type", create_type=False)
    op.add_column(
        "urls",
        sa.Column("record_type", record_type_enum, nullable=True),
    )


def _create_url_record_type_table():
    """Create the `url_record_type` table keyed one-to-one on URL id."""
    # Reference the existing PostgreSQL enum type; do not recreate it.
    record_type_enum = postgresql.ENUM(name="record_type", create_type=False)
    op.create_table(
        URL_RECORD_TYPE_TABLE_NAME,
        url_id_column(primary_key=True),
        sa.Column("record_type", record_type_enum, nullable=False),
        created_at_column(),
    )


def _drop_url_record_type_table():
    """Remove the `url_record_type` table (downgrade path)."""
    op.drop_table(URL_RECORD_TYPE_TABLE_NAME)


def _migrate_url_record_types_from_url_record_type_table():
    """Copy record types from `url_record_type` back onto `urls.record_type` (downgrade path)."""
    # Join on url_id; URLs without a url_record_type row keep a NULL record_type.
    op.execute("""
    UPDATE urls
    SET record_type = url_record_type.record_type
    FROM url_record_type
    WHERE urls.id = url_record_type.url_id
    """)


def _migrate_url_record_types_to_url_record_type_table():
    """Copy non-NULL `urls.record_type` values into the new `url_record_type` table."""
    # NULLs are skipped: url_record_type.record_type is NOT NULL, so only
    # URLs that actually have a record type get a row.
    op.execute("""
    INSERT INTO url_record_type (url_id, record_type)
    SELECT id, record_type
    FROM urls
    WHERE record_type IS NOT NULL
    """)
35 changes: 24 additions & 11 deletions src/api/endpoints/annotate/all/post/models/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
location_ids: list[int]

@model_validator(mode="after")
def forbid_record_type_if_meta_url(self):
if self.suggested_status == URLType.META_URL and self.record_type is not None:
def forbid_record_type_if_meta_url_or_individual_record(self):

Check warning on line 15 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L15 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:15:1: D102 Missing docstring in public method
if self.suggested_status not in [
URLType.META_URL,
URLType.INDIVIDUAL_RECORD,
]:
return self
if self.record_type is not None:
raise FailedValidationException("record_type must be None if suggested_status is META_URL")
return self

Expand All @@ -24,31 +29,39 @@
return self

@model_validator(mode="after")
def require_location_if_meta_url_or_data_source(self):
if self.suggested_status not in [URLType.META_URL, URLType.DATA_SOURCE]:
def require_location_if_relevant(self):

Check warning on line 32 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L32 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:32:1: D102 Missing docstring in public method
if self.suggested_status not in [
URLType.META_URL,
URLType.DATA_SOURCE,
URLType.INDIVIDUAL_RECORD,
]:
return self
if len(self.location_ids) == 0:
raise FailedValidationException("location_ids must be provided if suggested_status is META_URL or DATA_SOURCE")
return self

@model_validator(mode="after")
def require_agency_id_if_meta_url_or_data_source(self):
if self.suggested_status not in [URLType.META_URL, URLType.DATA_SOURCE]:
def require_agency_id_if_relevant(self):

Check warning on line 44 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L44 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:44:1: D102 Missing docstring in public method
if self.suggested_status not in [
URLType.META_URL,
URLType.DATA_SOURCE,
URLType.INDIVIDUAL_RECORD,
]:
return self
if len(self.agency_ids) == 0:
raise FailedValidationException("agencies must be provided if suggested_status is META_URL or DATA_SOURCE")
return self

@model_validator(mode="after")
def forbid_all_else_if_not_meta_url_or_data_source(self):
if self.suggested_status in [URLType.META_URL, URLType.DATA_SOURCE]:
def forbid_all_else_if_not_relevant(self):

Check warning on line 56 in src/api/endpoints/annotate/all/post/models/request.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/post/models/request.py#L56 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/annotate/all/post/models/request.py:56:1: D102 Missing docstring in public method
if self.suggested_status != URLType.NOT_RELEVANT:
return self
if self.record_type is not None:
raise FailedValidationException("record_type must be None if suggested_status is not META_URL or DATA_SOURCE")
raise FailedValidationException("record_type must be None if suggested_status is NOT RELEVANT")
if len(self.agency_ids) > 0:
raise FailedValidationException("agency_ids must be empty if suggested_status is not META_URL or DATA_SOURCe")
raise FailedValidationException("agency_ids must be empty if suggested_status is NOT RELEVANT")
if len(self.location_ids) > 0:
raise FailedValidationException("location_ids must be empty if suggested_status is not META_URL or DATA_SOURCE")
raise FailedValidationException("location_ids must be empty if suggested_status is NOT RELEVANT")
return self


Expand Down
5 changes: 1 addition & 4 deletions src/api/endpoints/annotate/all/post/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ async def run(self, session: AsyncSession) -> None:
session.add(relevant_suggestion)

# If not relevant, do nothing else
if not self.post_info.suggested_status in [
URLType.META_URL,
URLType.DATA_SOURCE
]:
if self.post_info.suggested_status == URLType.NOT_RELEVANT:
return

locations: list[UserLocationSuggestion] = []
Expand Down
18 changes: 14 additions & 4 deletions src/api/endpoints/collector/manual/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from src.db.models.impl.url.core.enums import URLSource
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.queries.base.builder import QueryBuilderBase


Expand Down Expand Up @@ -37,9 +38,9 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
session.add(batch)
await session.flush()

batch_id = batch.id
url_ids = []
duplicate_urls = []
batch_id: int = batch.id
url_ids: list[int] = []
duplicate_urls: list[str] = []

for entry in self.dto.entries:
url = URL(
Expand All @@ -48,10 +49,10 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
description=entry.description,
collector_metadata=entry.collector_metadata,
status=URLStatus.OK.value,
record_type=entry.record_type.value if entry.record_type is not None else None,
source=URLSource.MANUAL
)


async with session.begin_nested():
try:
session.add(url)
Expand All @@ -60,6 +61,15 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
duplicate_urls.append(entry.url)
continue
await session.flush()

if entry.record_type is not None:
record_type = URLRecordType(
url_id=url.id,
record_type=entry.record_type,
)
session.add(record_type)


link = LinkBatchURL(
batch_id=batch_id,
url_id=url.id
Expand Down
18 changes: 10 additions & 8 deletions src/api/endpoints/review/approve/query_/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.impl.url.reviewing_user import ReviewingUserURL
from src.db.queries.base.builder import QueryBuilderBase

Expand All @@ -34,7 +35,7 @@ async def run(self, session: AsyncSession) -> None:

url = await self._get_url(session)

await self._optionally_update_record_type(url)
await self._optionally_update_record_type(session)

# Get existing agency ids
existing_agencies = url.confirmed_agencies or []
Expand Down Expand Up @@ -88,14 +89,15 @@ async def _optionally_update_optional_metdata(self, url: URL) -> None:
self.approval_info.supplying_entity
)

async def _optionally_update_record_type(self, url: URL) -> None:
update_if_not_none(
url,
"record_type",
self.approval_info.record_type.value
if self.approval_info.record_type is not None else None,
required=True
async def _optionally_update_record_type(self, session: AsyncSession) -> None:
if self.approval_info.record_type is None:
return

record_type = URLRecordType(
url_id=self.approval_info.url_id,
record_type=self.approval_info.record_type.value
)
session.add(record_type)

async def _get_url(self, session: AsyncSession) -> URL:
query = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.queries.base.builder import QueryBuilderBase
from src.db.utils.compression import decompress_html

Expand All @@ -33,10 +34,14 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut
select(
URL.id.label(label_url_id),
URL.url.label(label_url),
URL.record_type.label(label_record_type_fine),
URLRecordType.record_type.label(label_record_type_fine),
URLCompressedHTML.compressed_html.label(label_html),
FlagURLValidated.type.label(label_type)
)
.join(
URLRecordType,
URL.id == URLRecordType.url_id
)
.join(
URLCompressedHTML,
URL.id == URLCompressedHTML.url_id
Expand Down

This file was deleted.

Loading