Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ Note that some tasks/subtasks are themselves enabled by other tasks.
| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). |
| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. |
| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. |
| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. |

### URL Task Flags

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Add URL status view

Revision ID: 25b3fc777c31
Revises: 8b2adc95c5d7
Create Date: 2025-10-11 19:13:03.309461

"""
from typing import Sequence, Union

from alembic import op

from src.util.alembic_helpers import add_enum_value

# revision identifiers, used by Alembic.
revision: str = '25b3fc777c31'
down_revision: Union[str, None] = '8b2adc95c5d7'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create ``url_status_mat_view`` and add the refresh task type.

    The materialized view assigns every row of ``urls`` a single ``status``,
    using the first rule that matches:

    1. ``Submitted/Pipeline Complete`` - validated as not relevant, individual
       record or not found, or present in ``url_ds_meta_url`` or
       ``url_data_source``.
    2. ``Accepted`` - any other ``flag_url_validated`` row exists.
    3. ``Community Labeling`` - compressed HTML, web metadata and a screenshot
       all exist.
    4. ``Error`` - a ``url_task_error`` row exists for the ``Screenshot``,
       ``HTML`` or ``URL Probe`` task types.
    5. ``Intake`` - none of the above.

    Afterwards the ``Refresh Materialized Views`` value is added to the
    ``task_type`` enum through the ``add_enum_value`` helper.
    """
    # The error CTE uses DISTINCT so that a URL with errors for several of the
    # listed task types still joins to exactly one row; without it the view
    # would contain the URL once per error row and per-status counts would be
    # inflated.
    op.execute("""
    CREATE MATERIALIZED VIEW url_status_mat_view AS
    with
    urls_with_relevant_errors as (
        select distinct
            ute.url_id
        from
            url_task_error ute
        where
            ute.task_type in (
                'Screenshot',
                'HTML',
                'URL Probe'
            )
    )
    select
        u.id as url_id,
        case
            when (
                -- Validated as not relevant, individual record, or not found
                fuv.type in ('not relevant', 'individual record', 'not found')
                -- Has Meta URL in data sources app
                OR udmu.url_id is not null
                -- Has data source in data sources app
                OR uds.url_id is not null
            ) Then 'Submitted/Pipeline Complete'
            when fuv.type is not null THEN 'Accepted'
            when (
                -- Has compressed HTML
                uch.url_id is not null
                AND
                -- Has web metadata
                uwm.url_id is not null
                AND
                -- Has screenshot
                us.url_id is not null
            ) THEN 'Community Labeling'
            when uwre.url_id is not null then 'Error'
            ELSE 'Intake'
        END as status

    from
        urls u
        left join urls_with_relevant_errors uwre
            on u.id = uwre.url_id
        left join url_screenshot us
            on u.id = us.url_id
        left join url_compressed_html uch
            on u.id = uch.url_id
        left join url_web_metadata uwm
            on u.id = uwm.url_id
        left join flag_url_validated fuv
            on u.id = fuv.url_id
        left join url_ds_meta_url udmu
            on u.id = udmu.url_id
        left join url_data_source uds
            on u.id = uds.url_id
    """)

    add_enum_value(
        enum_name="task_type",
        enum_value="Refresh Materialized Views"
    )


def downgrade() -> None:
    """Drop the ``url_status_mat_view`` materialized view created by ``upgrade``.

    The ``Refresh Materialized Views`` value that ``upgrade`` adds to the
    ``task_type`` enum is left in place.
    NOTE(review): assumes PostgreSQL, which cannot drop a single enum value
    without recreating the type - confirm whether a project helper exists for
    that and call it here if so.
    """
    # IF EXISTS keeps the downgrade safe to re-run after a partial failure.
    op.execute("DROP MATERIALIZED VIEW IF EXISTS url_status_mat_view")
16 changes: 9 additions & 7 deletions src/api/endpoints/metrics/dtos/get/urls/aggregated/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

from pydantic import BaseModel

from src.db.models.impl.flag.url_validated.enums import URLType
from src.db.models.views.url_status.enums import URLStatusViewEnum

class GetMetricsURLValidatedOldestPendingURL(BaseModel):
    """Identify the oldest URL that is still pending, with its creation time."""

    # ID of the URL.
    url_id: int
    # Creation timestamp of the URL.
    created_at: datetime.datetime

class GetMetricsURLsAggregatedResponseDTO(BaseModel):
    """Response model carrying aggregated URL metrics.

    NOTE(review): the field list below appears to combine the individual
    ``count_urls_*`` / ``oldest_pending_url_*`` fields with their dict- and
    object-based replacements - confirm which set is meant to remain.
    """
    # Total number of URLs.
    count_urls_total: int
    count_urls_pending: int
    count_urls_submitted: int
    count_urls_rejected: int
    count_urls_validated: int
    count_urls_errors: int
    oldest_pending_url_created_at: datetime.datetime
    oldest_pending_url_id: int
    # Number of URLs keyed by URL status view value.
    count_urls_status: dict[URLStatusViewEnum, int]
    # Number of URLs keyed by URL type.
    count_urls_type: dict[URLType, int]
    # Oldest pending URL, or None when there is none.
    oldest_pending_url: GetMetricsURLValidatedOldestPendingURL | None
52 changes: 19 additions & 33 deletions src/api/endpoints/metrics/urls/aggregated/query/core.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO
from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO, \
GetMetricsURLValidatedOldestPendingURL
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.all import ALL_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.error import ERROR_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.pending import PENDING_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.rejected import REJECTED_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.submitted import SUBMITTED_SUBQUERY
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.validated import VALIDATED_SUBQUERY
from src.collectors.enums import URLStatus
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.oldest_pending_url import \
GetOldestPendingURLQueryBuilder
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.status import GetURLStatusCountQueryBuilder
from src.api.endpoints.metrics.urls.aggregated.query.subqueries.url_type import GetURLTypeCountQueryBuilder
from src.db.helpers.session import session_helper as sh
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.flag.url_validated.enums import URLType
from src.db.models.views.url_status.enums import URLStatusViewEnum
from src.db.queries.base.builder import QueryBuilderBase


class GetURLsAggregatedMetricsQueryBuilder(QueryBuilderBase):
    """Assemble the aggregated URL metrics response from several queries.

    NOTE(review): the body below appears to contain both the inline
    oldest-pending query / scalar sub-queries and the query-builder calls
    that replace them - confirm which statements are meant to remain.
    """

    async def run(self, session: AsyncSession) -> GetMetricsURLsAggregatedResponseDTO:
        """Run the metric queries on ``session`` and return the combined DTO."""

        # Inline lookup of the earliest-created URL whose status is OK.
        oldest_pending_url_query = select(
            URL.id,
            URL.created_at
        ).where(
            URL.status == URLStatus.OK.value
        ).order_by(
            URL.created_at.asc()
        ).limit(1)

        oldest_pending_url = await session.execute(oldest_pending_url_query)
        oldest_pending_url = oldest_pending_url.one_or_none()
        if oldest_pending_url is None:
            oldest_pending_url_id = None
            oldest_pending_created_at = None
        else:
            oldest_pending_url_id = oldest_pending_url.id
            oldest_pending_created_at = oldest_pending_url.created_at
        # Rebinds ``oldest_pending_url`` to the query builder's result.
        oldest_pending_url: GetMetricsURLValidatedOldestPendingURL | None = \
            await GetOldestPendingURLQueryBuilder().run(session=session)

        # URL counts keyed by status.
        status_counts: dict[URLStatusViewEnum, int] = \
            await GetURLStatusCountQueryBuilder().run(session=session)

        # URL counts keyed by URL type.
        validated_counts: dict[URLType, int] = \
            await GetURLTypeCountQueryBuilder().run(session=session)

        return GetMetricsURLsAggregatedResponseDTO(
            count_urls_total=await sh.scalar(session, query=ALL_SUBQUERY),
            count_urls_pending=await sh.scalar(session, query=PENDING_SUBQUERY),
            count_urls_submitted=await sh.scalar(session, query=SUBMITTED_SUBQUERY),
            count_urls_validated=await sh.scalar(session, query=VALIDATED_SUBQUERY),
            count_urls_rejected=await sh.scalar(session, query=REJECTED_SUBQUERY),
            count_urls_errors=await sh.scalar(session, query=ERROR_SUBQUERY),
            oldest_pending_url_id=oldest_pending_url_id,
            oldest_pending_url_created_at=oldest_pending_created_at,
            oldest_pending_url=oldest_pending_url,
            count_urls_status=status_counts,
            count_urls_type=validated_counts,
        )

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from sqlalchemy import select, RowMapping

Check warning on line 1 in src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py#L1 <100>

Missing docstring in public module
Raw output
./src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py:1:1: D100 Missing docstring in public module
from sqlalchemy.ext.asyncio import AsyncSession

from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLValidatedOldestPendingURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.views.url_status.core import URLStatusMatView
from src.db.models.views.url_status.enums import URLStatusViewEnum
from src.db.queries.base.builder import QueryBuilderBase

from src.db.helpers.session import session_helper as sh

Check warning on line 10 in src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py#L10 <401>

'src.db.helpers.session.session_helper as sh' imported but unused
Raw output
./src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py:10:1: F401 'src.db.helpers.session.session_helper as sh' imported but unused

class GetOldestPendingURLQueryBuilder(QueryBuilderBase):
    """Look up the oldest URL that is neither accepted nor submitted."""

    async def run(
        self,
        session: AsyncSession
    ) -> GetMetricsURLValidatedOldestPendingURL | None:
        """Return the earliest-created pending URL, or ``None`` if there is none.

        A URL counts as pending when its status in the URL status
        materialized view is anything other than
        ``SUBMITTED_PIPELINE_COMPLETE`` or ``ACCEPTED``.
        """
        finished_statuses = [
            URLStatusViewEnum.SUBMITTED_PIPELINE_COMPLETE.value,
            URLStatusViewEnum.ACCEPTED.value,
        ]
        still_pending = URLStatusMatView.status.not_in(finished_statuses)

        # Join the view to the URL table for the creation time, then keep only
        # the single earliest row.
        statement = select(URLStatusMatView.url_id, URL.created_at)
        statement = statement.join(URL, URLStatusMatView.url_id == URL.id)
        statement = statement.where(still_pending)
        statement = statement.order_by(URL.created_at.asc()).limit(1)

        result = await session.execute(statement)
        row: RowMapping | None = result.mappings().one_or_none()
        if row is not None:
            return GetMetricsURLValidatedOldestPendingURL(
                url_id=row["url_id"],
                created_at=row["created_at"],
            )
        return None

Check warning on line 47 in src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py#L47 <391>

blank line at end of file
Raw output
./src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py:47:1: W391 blank line at end of file

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Sequence

Check warning on line 1 in src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py#L1 <100>

Missing docstring in public module
Raw output
./src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py:1:1: D100 Missing docstring in public module

from sqlalchemy import select, func, RowMapping
from sqlalchemy.ext.asyncio import AsyncSession

from src.db.helpers.session import session_helper as sh
from src.db.models.views.url_status.core import URLStatusMatView
from src.db.models.views.url_status.enums import URLStatusViewEnum
from src.db.queries.base.builder import QueryBuilderBase


class GetURLStatusCountQueryBuilder(QueryBuilderBase):
    """Count URLs per status in the URL status materialized view."""

    async def run(
        self,
        session: AsyncSession
    ) -> dict[URLStatusViewEnum, int]:
        """Return the number of URLs for each status present in the view.

        Statuses with no rows in the view do not appear in the result.
        """
        url_count = func.count(URLStatusMatView.url_id).label("count")
        statement = select(URLStatusMatView.status, url_count).group_by(
            URLStatusMatView.status
        )

        rows: Sequence[RowMapping] = await sh.mappings(session, query=statement)

        # Convert each raw status value into its enum member as the key.
        counts: dict[URLStatusViewEnum, int] = {}
        for row in rows:
            counts[URLStatusViewEnum(row["status"])] = row["count"]
        return counts

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import Sequence

Check warning on line 1 in src/api/endpoints/metrics/urls/aggregated/query/subqueries/url_type.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/metrics/urls/aggregated/query/subqueries/url_type.py#L1 <100>

Missing docstring in public module
Raw output
./src/api/endpoints/metrics/urls/aggregated/query/subqueries/url_type.py:1:1: D100 Missing docstring in public module

from sqlalchemy import select, func, RowMapping
from sqlalchemy.ext.asyncio import AsyncSession

from src.db.helpers.session import session_helper as sh
from src.db.models.impl.flag.url_validated.enums import URLType
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.queries.base.builder import QueryBuilderBase


class GetURLTypeCountQueryBuilder(QueryBuilderBase):
    """Count validated URLs per URL type."""

    async def run(
        self,
        session: AsyncSession
    ) -> dict[URLType, int]:
        """Return the number of ``FlagURLValidated`` rows for each type.

        Types with no rows do not appear in the result.
        """
        url_count = func.count(FlagURLValidated.url_id).label("count")
        statement = select(FlagURLValidated.type, url_count).group_by(
            FlagURLValidated.type
        )

        rows: Sequence[RowMapping] = await sh.mappings(session, query=statement)

        # The "type" value is used as the key exactly as the query returns it.
        counts: dict[URLType, int] = {}
        for row in rows:
            counts[row["type"]] = row["count"]
        return counts

Check warning on line 33 in src/api/endpoints/metrics/urls/aggregated/query/subqueries/url_type.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/metrics/urls/aggregated/query/subqueries/url_type.py#L33 <292>

no newline at end of file
Raw output
./src/api/endpoints/metrics/urls/aggregated/query/subqueries/url_type.py:33:10: W292 no newline at end of file

This file was deleted.

Empty file.
Loading