Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Add url health view

Revision ID: 7a6c2e1b9d44
Revises: 1fb2286a016c
Create Date: 2026-02-27 12:00:00.000000

"""
from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = "7a6c2e1b9d44"
down_revision: Union[str, None] = "1fb2286a016c"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def _create_url_health_view() -> None:
    """Create the ``url_health_view`` database view.

    The view classifies every row of ``urls`` into a health bucket, derived
    from the most recent redirect link, the URL's own web metadata, and any
    Internet Archive probe metadata:

    * ``'OK'`` / code 100  -- the URL itself returned HTTP 200.
    * ``'OK'`` / code 150  -- the URL has a redirect whose destination
      returned HTTP 200.
    * ``'Archived'`` / code 200 -- neither responded 200, but an Internet
      Archive snapshot URL exists.
    * ``'Broken'`` / code 300 -- none of the above.

    It also exposes the raw status codes, the reconstructed redirect URL
    (scheme + url, with a trailing slash when ``trailing_slash`` is TRUE;
    bare ``url`` when ``scheme`` is NULL), and boolean convenience flags.

    NOTE(review): ``SELECT DISTINCT ON`` is PostgreSQL-specific, so this
    migration assumes a PostgreSQL backend — confirm against alembic/env.
    """
    op.execute(
        """
        CREATE VIEW url_health_view AS
        WITH latest_redirect AS (
            SELECT DISTINCT ON (lur.source_url_id)
                lur.source_url_id,
                lur.destination_url_id
            FROM link_urls_redirect_url lur
            ORDER BY lur.source_url_id, lur.updated_at DESC, lur.created_at DESC
        )
        SELECT
            u.id AS url_id,
            CASE
                WHEN uwm.status_code = 200 THEN 'OK'
                WHEN lr.destination_url_id IS NOT NULL AND redirect_uwm.status_code = 200 THEN 'OK'
                WHEN uiapm.archive_url IS NOT NULL THEN 'Archived'
                ELSE 'Broken'
            END AS health,
            CASE
                WHEN uwm.status_code = 200 THEN 100
                WHEN lr.destination_url_id IS NOT NULL AND redirect_uwm.status_code = 200 THEN 150
                WHEN uiapm.archive_url IS NOT NULL THEN 200
                ELSE 300
            END AS code,
            uwm.status_code,
            lr.destination_url_id AS redirect_url_id,
            CASE
                WHEN redirect_u.scheme IS NOT NULL AND redirect_u.trailing_slash = TRUE
                    THEN redirect_u.scheme || '://' || redirect_u.url || '/'
                WHEN redirect_u.scheme IS NOT NULL AND redirect_u.trailing_slash = FALSE
                    THEN redirect_u.scheme || '://' || redirect_u.url
                ELSE redirect_u.url
            END AS redirect_url,
            redirect_uwm.status_code AS redirect_status_code,
            (lr.destination_url_id IS NOT NULL) AS has_redirect,
            (lr.destination_url_id IS NOT NULL AND redirect_uwm.status_code = 200) AS redirect_is_healthy,
            (uiapm.archive_url IS NOT NULL) AS has_archive,
            uiapm.archive_url
        FROM urls u
        LEFT JOIN url_web_metadata uwm
            ON uwm.url_id = u.id
        LEFT JOIN latest_redirect lr
            ON lr.source_url_id = u.id
        LEFT JOIN urls redirect_u
            ON redirect_u.id = lr.destination_url_id
        LEFT JOIN url_web_metadata redirect_uwm
            ON redirect_uwm.url_id = lr.destination_url_id
        LEFT JOIN url_internet_archives_probe_metadata uiapm
            ON uiapm.url_id = u.id
        """
    )


def upgrade() -> None:
    """Create the ``url_health_view`` view.

    Docstring added to satisfy flake8 D103 (missing docstring in public
    function), as flagged by CI. Also removes the pasted CI-annotation
    text that was interleaved into the function body.
    """
    _create_url_health_view()


def downgrade() -> None:
    """Drop the ``url_health_view`` view if it exists.

    ``IF EXISTS`` makes the downgrade idempotent. Docstring added to
    satisfy flake8 D103 (missing docstring in public function), as
    flagged by CI; pasted CI-annotation text removed from the body.
    """
    op.execute("DROP VIEW IF EXISTS url_health_view")
20 changes: 18 additions & 2 deletions src/api/endpoints/data_source/_shared/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.materialized_views.url_health.sqlalchemy import URLHealthMaterializedView


def build_data_source_get_query() -> Select:
Expand Down Expand Up @@ -39,7 +40,18 @@ def build_data_source_get_query() -> Select:
URLOptionalDataSourceMetadata.scraper_url,
URLOptionalDataSourceMetadata.submission_notes,
URLOptionalDataSourceMetadata.access_notes,
URLOptionalDataSourceMetadata.access_types
URLOptionalDataSourceMetadata.access_types,

URLHealthMaterializedView.health,
URLHealthMaterializedView.code,
URLHealthMaterializedView.status_code,
URLHealthMaterializedView.redirect_url_id,
URLHealthMaterializedView.redirect_url,
URLHealthMaterializedView.redirect_status_code,
URLHealthMaterializedView.has_redirect,
URLHealthMaterializedView.redirect_is_healthy,
URLHealthMaterializedView.has_archive,
URLHealthMaterializedView.archive_url,
)
.join(
URLRecordType,
Expand All @@ -60,7 +72,11 @@ def build_data_source_get_query() -> Select:
URLOptionalDataSourceMetadata,
URLOptionalDataSourceMetadata.url_id == URL.id
)
.outerjoin(
URLHealthMaterializedView,
URLHealthMaterializedView.url_id == URL.id,
)
.options(
selectinload(URL.confirmed_agencies),
)
)
)
24 changes: 21 additions & 3 deletions src/api/endpoints/data_source/_shared/process.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from sqlalchemy import RowMapping

from src.api.endpoints.data_source.get.response import DataSourceGetResponse
from src.api.endpoints.data_source.get.response import DataSourceGetResponse, DataSourceURLHealthResponse
from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.materialized_views.url_health.sqlalchemy import URLHealthMaterializedView


def process_data_source_get_mapping(
Expand All @@ -16,6 +17,22 @@ def process_data_source_get_mapping(
for agency in url.confirmed_agencies:
url_agency_ids.append(agency.id)

url_health: DataSourceURLHealthResponse | None = None
health_value = mapping[URLHealthMaterializedView.health]
if health_value is not None:
url_health = DataSourceURLHealthResponse(
value=health_value,
code=mapping[URLHealthMaterializedView.code],
http_status_code=mapping[URLHealthMaterializedView.status_code],
redirect_url_id=mapping[URLHealthMaterializedView.redirect_url_id],
redirect_url=mapping[URLHealthMaterializedView.redirect_url],
redirect_http_status_code=mapping[URLHealthMaterializedView.redirect_status_code],
has_redirect=mapping[URLHealthMaterializedView.has_redirect] or False,
redirect_is_healthy=mapping[URLHealthMaterializedView.redirect_is_healthy] or False,
has_archive=mapping[URLHealthMaterializedView.has_archive] or False,
archive_url=mapping[URLHealthMaterializedView.archive_url] or None,
)

return DataSourceGetResponse(
url_id=mapping[URL.id],
url=mapping[URL.url],
Expand All @@ -40,5 +57,6 @@ def process_data_source_get_mapping(
scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url],
submission_notes=mapping[URLOptionalDataSourceMetadata.submission_notes],
access_notes=mapping[URLOptionalDataSourceMetadata.access_notes],
access_types=mapping[URLOptionalDataSourceMetadata.access_types] or []
)
access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [],
url_health=url_health,
)
15 changes: 15 additions & 0 deletions src/api/endpoints/data_source/get/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@
from src.core.enums import RecordType
from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \
RetentionScheduleEnum, AccessTypeEnum
from src.db.models.materialized_views.url_health.enums import URLHealthViewEnum


class DataSourceURLHealthResponse(BaseModel):
    """URL health details attached to a data-source GET response.

    Populated from the ``url_health_view`` columns; ``value``/``code``
    carry the health classification, the remaining fields expose the raw
    HTTP status, redirect, and Internet Archive information.

    Docstring added to satisfy flake8 D101 (missing docstring in public
    class), as flagged by CI.
    """

    # Health classification (e.g. OK / Archived / Broken) and its numeric code.
    value: URLHealthViewEnum
    code: int
    # HTTP status of the URL itself, when probed.
    http_status_code: int | None = None
    # Latest redirect destination, if any.
    redirect_url_id: int | None = None
    redirect_url: str | None = None
    redirect_http_status_code: int | None = None
    has_redirect: bool = False
    redirect_is_healthy: bool = False
    # Internet Archive snapshot, if one exists.
    has_archive: bool = False
    archive_url: str | None = None


class DataSourceGetResponse(BaseModel):
Expand Down Expand Up @@ -38,6 +52,7 @@
submission_notes: str | None = None
access_notes: str | None = None
access_types: list[AccessTypeEnum]
url_health: DataSourceURLHealthResponse | None = None

class DataSourceGetOuterResponse(BaseModel):
results: list[DataSourceGetResponse]
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@
from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \
DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_health_to_ds_url_status
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata
from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView
from src.db.models.materialized_views.url_health.sqlalchemy import URLHealthMaterializedView
from src.db.queries.base.builder import QueryBuilderBase
from src.external.pdap.enums import DataSourcesURLStatus
from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel
from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest

Expand All @@ -42,7 +39,6 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
# Required
URL.full_url,
URL.name,
URLWebMetadata.status_code,
URLRecordType.record_type,
agency_id_cte.c.agency_ids,
# Optional
Expand All @@ -61,7 +57,9 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
URLOptionalDataSourceMetadata.scraper_url,
URLOptionalDataSourceMetadata.access_notes,
URLOptionalDataSourceMetadata.access_types,
URLInternetArchivesProbeMetadata.archive_url,
URLHealthMaterializedView.health,
URLHealthMaterializedView.has_archive,
URLHealthMaterializedView.archive_url,
)
.select_from(
cte.cte
Expand All @@ -75,12 +73,8 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
URL.id == URLOptionalDataSourceMetadata.url_id,
)
.outerjoin(
URLWebMetadata,
URL.id == URLWebMetadata.url_id
)
.outerjoin(
URLInternetArchivesProbeMetadata,
URL.id == URLInternetArchivesProbeMetadata.url_id,
URLHealthMaterializedView,
URL.id == URLHealthMaterializedView.url_id,
)
.join(
URLRecordType,
Expand Down Expand Up @@ -124,15 +118,15 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url],
access_notes=mapping[URLOptionalDataSourceMetadata.access_notes],
access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [],
# TODO: Change to convert web metadata result to URL Status
url_status=convert_sm_url_status_to_ds_url_status(
mapping[URLWebMetadata.status_code],
url_status=convert_sm_url_health_to_ds_url_status(
health=mapping[URLHealthMaterializedView.health],
has_archive=mapping[URLHealthMaterializedView.has_archive] or False,
),
internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None,
internet_archives_url=mapping[URLHealthMaterializedView.archive_url] or None,
)
)
)

return AddDataSourcesOuterRequest(
data_sources=inner_requests,
)
)
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from sqlalchemy import select, or_, Column, CTE
from sqlalchemy import select, or_, Column, CTE, and_, exists

Check warning on line 1 in src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py:1:1: D100 Missing docstring in public module

from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata


class DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer:
Expand Down Expand Up @@ -32,6 +34,35 @@
URLOptionalDataSourceMetadata.updated_at > DSAppLinkDataSource.last_synced_at,
URLRecordType.created_at > DSAppLinkDataSource.last_synced_at,
URLRecordType.updated_at > DSAppLinkDataSource.last_synced_at,
exists(
select(URLWebMetadata.url_id).where(
and_(
URLWebMetadata.url_id == DSAppLinkDataSource.url_id,
URLWebMetadata.updated_at > DSAppLinkDataSource.last_synced_at,
)
)
),
exists(
select(LinkURLRedirectURL.source_url_id).where(
and_(
LinkURLRedirectURL.source_url_id == DSAppLinkDataSource.url_id,
LinkURLRedirectURL.updated_at > DSAppLinkDataSource.last_synced_at,
)
)
),
exists(
select(LinkURLRedirectURL.source_url_id)
.join(
URLWebMetadata,
URLWebMetadata.url_id == LinkURLRedirectURL.destination_url_id,
)
.where(
and_(
LinkURLRedirectURL.source_url_id == DSAppLinkDataSource.url_id,
URLWebMetadata.updated_at > DSAppLinkDataSource.last_synced_at,
)
)
),
)
).cte("ds_app_link_sync_data_source_update_prerequisites")
)
Expand All @@ -46,4 +77,4 @@

@property
def cte(self) -> CTE:
return self._cte
return self._cte
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@
from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \
DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_health_to_ds_url_status
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata
from src.db.models.materialized_views.url_health.sqlalchemy import URLHealthMaterializedView
from src.db.queries.base.builder import QueryBuilderBase
from src.external.pdap.enums import DataSourcesURLStatus
from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel
from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest, \
UpdateDataSourcesInnerRequest
Expand Down Expand Up @@ -42,7 +40,6 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
# Required
URL.full_url,
URL.name,
URLWebMetadata.status_code,
URLRecordType.record_type,
agency_id_cte.c.agency_ids,
# Optional
Expand All @@ -62,7 +59,9 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
URLOptionalDataSourceMetadata.access_notes,
URLOptionalDataSourceMetadata.access_types,
URLOptionalDataSourceMetadata.data_portal_type_other,
URLInternetArchivesProbeMetadata.archive_url,
URLHealthMaterializedView.health,
URLHealthMaterializedView.has_archive,
URLHealthMaterializedView.archive_url,
)
.select_from(
cte.cte
Expand All @@ -76,17 +75,13 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
URL.id == URLOptionalDataSourceMetadata.url_id,
)
.outerjoin(
URLInternetArchivesProbeMetadata,
URL.id == URLInternetArchivesProbeMetadata.url_id,
URLHealthMaterializedView,
URLHealthMaterializedView.url_id == URL.id,
)
.join(
URLRecordType,
URLRecordType.url_id == URL.id,
)
.outerjoin(
URLWebMetadata,
URLWebMetadata.url_id == URL.id,
)
.outerjoin(
agency_id_cte,
cte.url_id == agency_id_cte.c.url_id
Expand Down Expand Up @@ -126,10 +121,11 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
access_notes=mapping[URLOptionalDataSourceMetadata.access_notes],
access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [],
data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other],
url_status=convert_sm_url_status_to_ds_url_status(
mapping[URLWebMetadata.status_code],
url_status=convert_sm_url_health_to_ds_url_status(
health=mapping[URLHealthMaterializedView.health],
has_archive=mapping[URLHealthMaterializedView.has_archive] or False,
),
internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None,
internet_archives_url=mapping[URLHealthMaterializedView.archive_url] or None,
)
)
)
Expand Down
Loading