Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Create url_probed_for_404 table and adjust logic for 404 probe

Revision ID: b5f079b6b8cb
Revises: 864107b703ae
Create Date: 2025-05-13 12:34:46.846656

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from util.alembic_helpers import switch_enum_type

# revision identifiers, used by Alembic.
revision: str = 'b5f079b6b8cb'
down_revision: Union[str, None] = '864107b703ae'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the 404-probe tracking table and register the new enum values.

    Steps, in order:

    1. Create ``url_probed_for_404`` (one row per probed URL, holding the
       time of the last probe) and make ``url_id`` unique in it.
    2. Make ``url_id`` unique in ``url_checked_for_duplicate``.
    3. Rebuild the ``url_status`` enum with ``'404 not found'`` added, then
       re-create the ``url_name_not_null_when_validated`` check constraint
       that is handed to ``switch_enum_type`` for dropping.
    4. Rebuild the ``task_type`` enum with ``'404 Probe'`` added.
    5. Reclassify existing ``'error'`` URLs whose recorded error text
       contains ``404`` as ``'404 not found'``.
    """
    op.create_table(
        'url_probed_for_404',
        sa.Column('id', sa.Integer(), nullable=False, primary_key=True),
        # Mirror the ForeignKey('urls.id') declared on the URLProbedFor404
        # ORM model so the database schema and the model agree.
        sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id'), nullable=False),
        sa.Column(
            'last_probed_at',
            sa.DateTime(),
            nullable=False,
            server_default=sa.text('now()'),
        ),
    )

    # Add unique constraint to url_id column
    op.create_unique_constraint(
        'uq_url_probed_for_404_url_id', 'url_probed_for_404', ['url_id']
    )
    # Add unique constraint for url_id column in url_checked_for_duplicate table
    op.create_unique_constraint(
        'uq_url_checked_for_duplicates_url_id', 'url_checked_for_duplicate', ['url_id']
    )

    # Create new `404 Not Found` URL Status
    switch_enum_type(
        table_name='urls',
        column_name='outcome',
        enum_name='url_status',
        new_enum_values=[
            'pending',
            'submitted',
            'validated',
            'duplicate',
            'rejected',
            'error',
            '404 not found',
        ],
        check_constraints_to_drop=['url_name_not_null_when_validated']
    )

    # Add '404 Probe' to TaskType Enum
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=[
            'HTML',
            'Relevancy',
            'Record Type',
            'Agency Identification',
            'Misc Metadata',
            'Submit Approved URLs',
            'Duplicate Detection',
            '404 Probe'
        ]
    )

    # Re-create the check constraint that was listed in
    # check_constraints_to_drop for the url_status switch above.
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT url_name_not_null_when_validated
        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
        """
    )

    # Update existing error URLs with an error message of 404 Not Found
    # NOTE(review): LIKE '%404%' matches any error text containing "404",
    # not only HTTP 404 responses -- confirm this is acceptable.
    op.execute("""
        UPDATE urls
        SET outcome = '404 not found'
        FROM url_error_info uei
        WHERE urls.id = uei.url_id
        AND urls.outcome = 'error'
        AND uei.error LIKE '%404%';
    """)


def downgrade() -> None:
    """Reverse the 404-probe migration.

    Steps, in order:

    1. Drop the ``url_probed_for_404`` table.
    2. Drop the unique constraint on ``url_checked_for_duplicate.url_id``.
    3. Map every ``'404 not found'`` URL back to ``'error'`` so the value
       can be removed from the ``url_status`` enum, rebuild that enum
       without it, and re-create the ``url_name_not_null_when_validated``
       check constraint.
    4. Rebuild the ``task_type`` enum without ``'404 Probe'``.
    """
    op.drop_table('url_probed_for_404')

    # Drop unique constraint for url_id column in url_checked_for_duplicate table
    op.drop_constraint(
        'uq_url_checked_for_duplicates_url_id',
        'url_checked_for_duplicate',
        type_='unique'
    )

    # Drop `404 Not Found` URL Status
    op.execute("""
        UPDATE urls
        SET outcome = 'error'
        WHERE outcome = '404 not found';
    """)

    switch_enum_type(
        table_name='urls',
        column_name='outcome',
        enum_name='url_status',
        new_enum_values=[
            'pending',
            'submitted',
            'validated',
            'duplicate',
            'rejected',
            'error',
        ],
        check_constraints_to_drop=['url_name_not_null_when_validated']
    )

    # Re-create the check constraint that was listed in
    # check_constraints_to_drop for the url_status switch above.
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT url_name_not_null_when_validated
        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
        """
    )

    # NOTE(review): unlike urls.outcome above, rows in `tasks` that already
    # use '404 Probe' are not remapped or removed before the enum is
    # narrowed -- confirm switch_enum_type tolerates that, or clean them up.
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=[
            'HTML',
            'Relevancy',
            'Record Type',
            'Agency Identification',
            'Misc Metadata',
            'Submit Approved URLs',
            'Duplicate Detection',
        ]
    )
87 changes: 84 additions & 3 deletions collector_db/AsyncDatabaseClient.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime, timedelta
from functools import wraps
from operator import or_
from typing import Optional, Type, Any, List

from fastapi import HTTPException
Expand Down Expand Up @@ -29,7 +30,7 @@
RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \
UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \
UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log, \
BacklogSnapshot, URLDataSource, URLCheckedForDuplicate
BacklogSnapshot, URLDataSource, URLCheckedForDuplicate, URLProbedFor404
from collector_manager.enums import URLStatus, CollectorType
from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
Expand Down Expand Up @@ -60,6 +61,7 @@
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
from core.DTOs.task_data_objects.URL404ProbeTDO import URL404ProbeTDO
from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo
from core.EnvVarManager import EnvVarManager
Expand Down Expand Up @@ -468,12 +470,14 @@


@session_manager
async def get_pending_urls_without_html_data(self, session: AsyncSession) -> list[URLInfo]:
    """Fetch a batch of pending URLs that have no HTML data yet.

    The base query comes from
    ``self.statement_composer.pending_urls_without_html_data()``; it is
    capped at 100 rows and ordered by ``URL.id``.

    Returns:
        The matching ``URL`` rows converted to ``URLInfo`` DTOs via
        ``DTOConverter.url_list_to_url_info_list``.
    """
    # TODO: Add test that includes some urls WITH html data. Check they're not returned
    statement = self.statement_composer.pending_urls_without_html_data()
    statement = statement.limit(100).order_by(URL.id)
    scalar_result = await session.scalars(statement)
    results: list[URL] = scalar_result.all()
    return DTOConverter.url_list_to_url_info_list(results)


async def get_urls_with_html_data_and_without_models(
self,
Expand Down Expand Up @@ -1343,6 +1347,13 @@
url = raw_result.scalars().first()
return URLInfo(**url.__dict__)

@session_manager
async def get_url_info_by_id(self, session: AsyncSession, url_id: int) -> Optional[URLInfo]:
    """Look up a single URL by primary key.

    Args:
        session: Active database session.
        url_id: Value matched against ``URL.id``.

    Returns:
        A ``URLInfo`` built from the row's attributes, or ``None`` when no
        row has that id.
    """
    query = Select(URL).where(URL.id == url_id)
    raw_result = await session.execute(query)
    url = raw_result.scalars().first()
    if url is None:
        # .first() yields None for an empty result; honour the Optional
        # return type instead of failing on `None.__dict__`.
        return None
    return URLInfo(**url.__dict__)

@session_manager
async def insert_logs(self, session, log_infos: List[LogInfo]):
for log_info in log_infos:
Expand Down Expand Up @@ -2267,8 +2278,78 @@
query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.DUPLICATE.value)
await session.execute(query)

@session_manager
async def mark_all_as_404(self, session: AsyncSession, url_ids: List[int]):
    """Set the outcome of every URL whose id is in ``url_ids`` to 404 Not Found.

    Args:
        session: Active database session.
        url_ids: Ids of the ``URL`` rows to update.
    """
    not_found_status = URLStatus.NOT_FOUND.value
    statement = (
        update(URL)
        .where(URL.id.in_(url_ids))
        .values(outcome=not_found_status)
    )
    await session.execute(statement)

@session_manager
async def mark_all_as_recently_probed_for_404(
    self,
    session: AsyncSession,
    url_ids: List[int],
    dt: datetime = func.now()
):
    """Record ``dt`` as the last 404-probe time for each URL in ``url_ids``.

    Issues a single PostgreSQL ``INSERT ... ON CONFLICT (url_id) DO UPDATE``
    against ``URLProbedFor404``: URLs without a row get one, and URLs that
    already have a row get ``last_probed_at`` overwritten with ``dt``.

    Args:
        session: Active database session.
        url_ids: Ids of the URLs that were just probed. An empty list is a
            no-op.
        dt: Timestamp to store. The default is the SQL expression
            ``func.now()`` rather than a Python ``datetime``.
    """
    if not url_ids:
        # Nothing to record; avoid building an INSERT with an empty
        # VALUES list.
        return

    from sqlalchemy.dialects.postgresql import insert as pg_insert
    values = [
        {"url_id": url_id, "last_probed_at": dt} for url_id in url_ids
    ]
    stmt = pg_insert(URLProbedFor404).values(values)
    update_stmt = stmt.on_conflict_do_update(
        index_elements=['url_id'],
        set_={"last_probed_at": dt}
    )
    await session.execute(update_stmt)


@session_manager
async def mark_as_checked_for_duplicates(self, session: AsyncSession, url_ids: list[int]):
    """Add one ``URLCheckedForDuplicate`` row to the session per id in ``url_ids``.

    Args:
        session: Active database session the new rows are added to.
        url_ids: Ids of the URLs that have been checked for duplicates.
    """
    session.add_all(
        URLCheckedForDuplicate(url_id=checked_id) for checked_id in url_ids
    )

@session_manager
async def has_pending_urls_not_recently_probed_for_404(self, session: AsyncSession) -> bool:
    """Report whether any pending URL is due for a 404 probe.

    A URL is due when its outcome is pending and it either has no
    ``URLProbedFor404`` row or its ``last_probed_at`` is more than 30 days
    before the database's ``now()``.

    Returns:
        ``True`` if at least one such URL exists, otherwise ``False``.
    """
    month_ago = func.now() - timedelta(days=30)
    query = (
        select(
            URL.id
        ).outerjoin(
            URLProbedFor404
        ).where(
            and_(
                URL.outcome == URLStatus.PENDING.value,
                or_(
                    # The outer join leaves this NULL for never-probed URLs;
                    # .is_(None) renders IS NULL without tripping E711.
                    URLProbedFor404.id.is_(None),
                    URLProbedFor404.last_probed_at < month_ago
                )
            )
        ).limit(1)
    )

    raw_result = await session.execute(query)
    result = raw_result.one_or_none()
    return result is not None

@session_manager
async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSession) -> List[URL404ProbeTDO]:
    """Fetch up to 100 pending URLs that are due for a 404 probe.

    A URL is due when its outcome is pending and it either has no
    ``URLProbedFor404`` row or its ``last_probed_at`` is more than 30 days
    before the database's ``now()``.

    Returns:
        One ``URL404ProbeTDO`` per matching URL, carrying its ``url`` and
        ``url_id``; ``is_404`` is left at its default.
    """
    month_ago = func.now() - timedelta(days=30)
    query = (
        select(
            URL
        ).outerjoin(
            URLProbedFor404
        ).where(
            and_(
                URL.outcome == URLStatus.PENDING.value,
                or_(
                    # The outer join leaves this NULL for never-probed URLs;
                    # .is_(None) renders IS NULL without tripping E711.
                    URLProbedFor404.id.is_(None),
                    URLProbedFor404.last_probed_at < month_ago
                )
            )
        ).limit(100)
    )

    raw_result = await session.execute(query)
    urls = raw_result.scalars().all()
    return [URL404ProbeTDO(url=url.url, url_id=url.id) for url in urls]

Check warning on line 2355 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2355 <292>

no newline at end of file
Raw output
./collector_db/AsyncDatabaseClient.py:2355:76: W292 no newline at end of file
19 changes: 19 additions & 0 deletions collector_db/DTOConverter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional

from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType, URLHTMLContentInfo
from collector_db.DTOs.URLInfo import URLInfo
from collector_db.DTOs.URLWithHTML import URLWithHTML
from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent, URL, Agency, \
AutoRecordTypeSuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, AutoRelevantSuggestion, \
Expand Down Expand Up @@ -158,6 +159,24 @@
def url_list_to_url_with_html_list(url_list: list[URL]) -> list[URLWithHTML]:
return [DTOConverter.url_to_url_with_html(url) for url in url_list]

@staticmethod
def url_list_to_url_info_list(urls: list[URL]) -> list[URLInfo]:
    """Convert ``URL`` ORM rows into ``URLInfo`` DTOs, preserving order.

    Each DTO copies ``id``, ``batch_id``, ``url``, ``collector_metadata``,
    ``outcome``, ``created_at``, ``updated_at`` and ``name`` from its row.
    """
    return [
        URLInfo(
            id=row.id,
            batch_id=row.batch_id,
            url=row.url,
            collector_metadata=row.collector_metadata,
            outcome=row.outcome,
            created_at=row.created_at,
            updated_at=row.updated_at,
            name=row.name
        )
        for row in urls
    ]

@staticmethod
def url_to_url_with_html(url: URL) -> URLWithHTML:
url_val = url.url
Expand Down
1 change: 1 addition & 0 deletions collector_db/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class TaskType(PyEnum):
SUBMIT_APPROVED = "Submit Approved URLs"
DUPLICATE_DETECTION = "Duplicate Detection"
IDLE = "Idle"
PROBE_404 = "404 Probe"

class PGEnum(TypeDecorator):
impl = postgresql.ENUM
Expand Down
16 changes: 16 additions & 0 deletions collector_db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
'rejected',
'duplicate',
'error',
'404 not found',
name='url_status'
),
nullable=False
Expand Down Expand Up @@ -146,6 +147,11 @@
uselist=False,
back_populates="url"
)
probed_for_404 = relationship(
"URLProbedFor404",
uselist=False,
back_populates="url"
)

class URLCheckedForDuplicate(Base):
__tablename__ = 'url_checked_for_duplicate'
Expand All @@ -157,6 +163,16 @@
# Relationships
url = relationship("URL", uselist=False, back_populates="checked_for_duplicate")

class URLProbedFor404(Base):
    """ORM model for ``url_probed_for_404``: when a URL was last probed for a 404.

    Holds at most one row per URL, linked one-to-one with ``URL`` through
    ``URL.probed_for_404``.
    """
    __tablename__ = 'url_probed_for_404'

    id = Column(Integer, primary_key=True)
    # unique=True matches the unique constraint the migration adds on
    # url_id, which the ON CONFLICT (url_id) upsert and the one-to-one
    # relationship rely on.
    url_id = Column(Integer, ForeignKey('urls.id'), unique=True, nullable=False)
    # Built by the shared created-at column helper; presumably defaults to
    # the insert time -- confirm against get_created_at_column.
    last_probed_at = get_created_at_column()

    # Relationships
    url = relationship("URL", uselist=False, back_populates="probed_for_404")

class URLOptionalDataSourceMetadata(Base):
__tablename__ = 'url_optional_data_source_metadata'

Expand Down
1 change: 1 addition & 0 deletions collector_manager/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ class URLStatus(Enum):
ERROR = "error"
DUPLICATE = "duplicate"
REJECTED = "rejected"
NOT_FOUND = "404 not found"
9 changes: 9 additions & 0 deletions core/DTOs/task_data_objects/URL404ProbeTDO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import Optional

Check warning on line 1 in core/DTOs/task_data_objects/URL404ProbeTDO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/task_data_objects/URL404ProbeTDO.py#L1 <100>

Missing docstring in public module
Raw output
./core/DTOs/task_data_objects/URL404ProbeTDO.py:1:1: D100 Missing docstring in public module

from pydantic import BaseModel


class URL404ProbeTDO(BaseModel):
    """Task data object pairing a URL with the outcome of its 404 probe."""

    # Id of the URL row being probed.
    url_id: int
    # The URL string to probe.
    url: str
    # Probe outcome; defaults to None (presumably "not probed yet" --
    # confirm against the task operator that fills it in).
    is_404: Optional[bool] = None

Check warning on line 9 in core/DTOs/task_data_objects/URL404ProbeTDO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/task_data_objects/URL404ProbeTDO.py#L9 <292>

no newline at end of file
Raw output
./core/DTOs/task_data_objects/URL404ProbeTDO.py:9:34: W292 no newline at end of file
6 changes: 3 additions & 3 deletions core/DTOs/task_data_objects/UrlHtmlTDO.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from dataclasses import dataclass
from typing import Optional

from pydantic import BaseModel

from collector_db.DTOs.URLInfo import URLInfo
from html_tag_collector.DataClassTags import ResponseHTMLInfo
from html_tag_collector.URLRequestInterface import URLResponseInfo


@dataclass
class UrlHtmlTDO:
class UrlHtmlTDO(BaseModel):

Check warning on line 10 in core/DTOs/task_data_objects/UrlHtmlTDO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/task_data_objects/UrlHtmlTDO.py#L10 <101>

Missing docstring in public class
Raw output
./core/DTOs/task_data_objects/UrlHtmlTDO.py:10:1: D101 Missing docstring in public class
url_info: URLInfo
url_response_info: Optional[URLResponseInfo] = None
html_tag_info: Optional[ResponseHTMLInfo] = None
Expand Down
Loading