Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag
| `URL_AGENCY_IDENTIFICATION_TASK_FLAG` | Automatically assigns and suggests Agencies for URLs. |
| `URL_SUBMIT_APPROVED_TASK_FLAG` | Submits approved URLs to the Data Sources App. |
| `URL_MISC_METADATA_TASK_FLAG` | Adds misc metadata to URLs. |
| `URL_404_PROBE_TASK_FLAG` | Probes URLs for 404 errors. |
| `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. |
| `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. |
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""Remove 404 Probe Task

Revision ID: d55ec2987702
Revises: 25b3fc777c31
Create Date: 2025-10-12 15:49:01.945412

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

Check warning on line 11 in alembic/versions/2025_10_12_1549-d55ec2987702_remove_404_probe_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_10_12_1549-d55ec2987702_remove_404_probe_task.py#L11 <401>

'sqlalchemy as sa' imported but unused
Raw output
./alembic/versions/2025_10_12_1549-d55ec2987702_remove_404_probe_task.py:11:1: F401 'sqlalchemy as sa' imported but unused

from src.util.alembic_helpers import remove_enum_value, add_enum_value

# revision identifiers, used by Alembic.
revision: str = 'd55ec2987702'
down_revision: Union[str, None] = '25b3fc777c31'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Remove the '404 Probe' task and all data that supported it.

    Steps, in dependency order:
    1. Drop the views that reference the enums being modified.
    2. Add the 'broken page' url_type enum value.
    3. Delete all rows referencing the '404 Probe' task type, then
       remove the enum value itself (rows must go first or the enum
       value removal would fail on remaining references).
    4. Fold the '404 not found' URL status back into 'ok', then remove
       that enum value.
    5. Drop the per-URL probe-tracking table and recreate the views.
    """
    _drop_views()
    # 'broken page' becomes available as a validation type in url_type.
    add_enum_value(
        enum_name="url_type",
        enum_value="broken page"
    )

    # Purge rows referencing the retired task type before removing
    # the enum value they point at.
    op.execute(
        """DELETE FROM TASKS WHERE task_type = '404 Probe'"""
    )
    op.execute(
        """DELETE FROM url_task_error WHERE task_type = '404 Probe'"""
    )
    remove_enum_value(
        enum_name="task_type",
        value_to_remove="404 Probe",
        targets=[
            ("tasks", "task_type"),
            ("url_task_error", "task_type")
        ]
    )
    # Reset any URLs still carrying the retired status before the
    # enum value is dropped.
    op.execute(
        """UPDATE URLS SET status = 'ok' WHERE status = '404 not found'"""
    )
    remove_enum_value(
        enum_name="url_status",
        value_to_remove="404 not found",
        targets=[
            ("urls", "status")
        ]
    )

    op.drop_table("url_probed_for_404")

    _recreate_views()

def _drop_views():
    """Drop the task-count views and the URL status materialized view.

    These depend on enum types being altered by the migration, so they
    must be dropped up front and recreated afterwards.
    """
    statements = (
        "drop view url_task_count_1_day",
        "drop view url_task_count_1_week",
        "drop materialized view url_status_mat_view",
    )
    for statement in statements:
        op.execute(statement)

def _recreate_views():
    """Recreate the views dropped by ``_drop_views``.

    The definitions are identical to the pre-migration ones except that
    the url_status_mat_view no longer needs to reference the removed
    '404 Probe' task type.
    """
    # URLs touched per task type over the last day.
    op.execute("""
    create view url_task_count_1_day(task_type, count) as
    SELECT
        t.task_type,
        count(ltu.url_id) AS count
    FROM
        tasks t
        JOIN link_task_urls ltu
        ON ltu.task_id = t.id
    WHERE
        t.updated_at > (now() - '1 day'::interval)
    GROUP BY
        t.task_type;
    """)

    # URLs touched per task type over the last week.
    op.execute("""
    create view url_task_count_1_week(task_type, count) as
    SELECT
        t.task_type,
        count(ltu.url_id) AS count
    FROM
        tasks t
        JOIN link_task_urls ltu
        ON ltu.task_id = t.id
    WHERE
        t.updated_at > (now() - '7 days'::interval)
    GROUP BY
        t.task_type;
    """)

    # Pipeline status per URL, derived from which artifacts/validations
    # exist for it. (E123 fix: closing delimiter now matches the style
    # of the two view definitions above.)
    op.execute("""
    CREATE MATERIALIZED VIEW url_status_mat_view AS
    with
        urls_with_relevant_errors as (
            select
                ute.url_id
            from
                url_task_error ute
            where
                ute.task_type in (
                    'Screenshot',
                    'HTML',
                    'URL Probe'
                )
        )
    select
        u.id as url_id,
        case
            when (
                -- Validated as not relevant, individual record, or not found
                fuv.type in ('not relevant', 'individual record', 'not found')
                -- Has Meta URL in data sources app
                OR udmu.url_id is not null
                -- Has data source in data sources app
                OR uds.url_id is not null
            ) Then 'Submitted/Pipeline Complete'
            when fuv.type is not null THEN 'Accepted'
            when (
                -- Has compressed HTML
                uch.url_id is not null
                AND
                -- Has web metadata
                uwm.url_id is not null
                AND
                -- Has screenshot
                us.url_id is not null
            ) THEN 'Community Labeling'
            when uwre.url_id is not null then 'Error'
            ELSE 'Intake'
        END as status

    from
        urls u
        left join urls_with_relevant_errors uwre
            on u.id = uwre.url_id
        left join url_screenshot us
            on u.id = us.url_id
        left join url_compressed_html uch
            on u.id = uch.url_id
        left join url_web_metadata uwm
            on u.id = uwm.url_id
        left join flag_url_validated fuv
            on u.id = fuv.url_id
        left join url_ds_meta_url udmu
            on u.id = udmu.url_id
        left join url_data_source uds
            on u.id = uds.url_id
    """)

Check failure on line 153 in alembic/versions/2025_10_12_1549-d55ec2987702_remove_404_probe_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_10_12_1549-d55ec2987702_remove_404_probe_task.py#L153 <123>

closing bracket does not match indentation of opening bracket's line
Raw output
./alembic/versions/2025_10_12_1549-d55ec2987702_remove_404_probe_task.py:153:9: E123 closing bracket does not match indentation of opening bracket's line


def downgrade() -> None:
    """No-op downgrade: the removed task's rows, enum values, and the
    dropped ``url_probed_for_404`` table are not restored."""
15 changes: 7 additions & 8 deletions src/api/endpoints/review/reject/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ async def run(self, session) -> None:
url = await session.execute(query)
url = url.scalars().first()

validation_type: URLType | None = None
validation_type: URLType
match self.rejection_reason:
case RejectionReason.INDIVIDUAL_RECORD:
validation_type = URLType.INDIVIDUAL_RECORD
case RejectionReason.BROKEN_PAGE_404:
url.status = URLStatus.NOT_FOUND.value
validation_type = URLType.BROKEN_PAGE
case RejectionReason.NOT_RELEVANT:
validation_type = URLType.NOT_RELEVANT
case _:
Expand All @@ -49,12 +49,11 @@ async def run(self, session) -> None:
detail="Invalid rejection reason"
)

if validation_type is not None:
flag_url_validated = FlagURLValidated(
url_id=self.url_id,
type=validation_type
)
session.add(flag_url_validated)
flag_url_validated = FlagURLValidated(
url_id=self.url_id,
type=validation_type
)
session.add(flag_url_validated)

# Add rejecting user
rejecting_user_url = ReviewingUserURL(
Expand Down
1 change: 0 additions & 1 deletion src/collectors/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,3 @@ class URLStatus(Enum):
OK = "ok"
ERROR = "error"
DUPLICATE = "duplicate"
NOT_FOUND = "404 not found"
11 changes: 0 additions & 11 deletions src/core/tasks/url/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from src.core.tasks.url.operators.location_id.subtasks.loader import LocationIdentificationSubtaskLoader
from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator
from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator
from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator
from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator
from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier
from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator
Expand Down Expand Up @@ -126,15 +125,6 @@ def _get_url_miscellaneous_metadata_task_operator(self) -> URLTaskEntry:
enabled=self.setup_flag("URL_MISC_METADATA_TASK_FLAG")
)

def _get_url_404_probe_task_operator(self) -> URLTaskEntry:
operator = URL404ProbeTaskOperator(
adb_client=self.adb_client,
url_request_interface=self.url_request_interface
)
return URLTaskEntry(
operator=operator,
enabled=self.setup_flag("URL_404_PROBE_TASK_FLAG")
)

def _get_url_auto_relevance_task_operator(self) -> URLTaskEntry:
operator = URLAutoRelevantTaskOperator(
Expand Down Expand Up @@ -220,7 +210,6 @@ async def load_entries(self) -> list[URLTaskEntry]:
self._get_url_root_url_task_operator(),
self._get_url_probe_task_operator(),
self._get_url_html_task_operator(),
self._get_url_404_probe_task_operator(),
self._get_url_record_type_task_operator(),
self._get_agency_identification_task_operator(),
self._get_url_miscellaneous_metadata_task_operator(),
Expand Down
7 changes: 4 additions & 3 deletions src/core/tasks/url/operators/probe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder
from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder
from src.core.tasks.url.operators.probe.tdo import URLProbeTDO
from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic
from src.external.url_request.core import URLRequestInterface
from src.db.client.async_ import AsyncDatabaseClient
from src.db.dtos.url.mapping import URLMapping
Expand Down Expand Up @@ -68,10 +69,10 @@ async def probe_urls(self, tdos: list[URLProbeTDO]) -> None:

async def update_database(self, tdos: list[URLProbeTDO]) -> None:
non_redirect_tdos = filter_non_redirect_tdos(tdos)
web_metadata_objects = convert_tdo_to_web_metadata_list(non_redirect_tdos)
await self.adb_client.bulk_insert(web_metadata_objects)
web_metadata_objects: list[URLWebMetadataPydantic] = convert_tdo_to_web_metadata_list(non_redirect_tdos)
await self.adb_client.bulk_upsert(web_metadata_objects)

redirect_tdos = filter_redirect_tdos(tdos)
redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(tdos)

query_builder = InsertRedirectsQueryBuilder(tdos=redirect_tdos)
await self.adb_client.run_query_builder(query_builder)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


def extract_response_pairs(tdos: list[URLProbeTDO]) -> list[URLProbeRedirectResponsePair]:
results = []
results: list[URLProbeRedirectResponsePair] = []
for tdo in tdos:
if not tdo.response.is_redirect:
raise ValueError(f"Expected {tdo.url_mapping.url} to be a redirect.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from src.core.tasks.url.operators.probe.tdo import URLProbeTDO
from src.db.dtos.url.mapping import URLMapping
from src.db.queries.base.builder import QueryBuilderBase
from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair
from src.external.url_request.probe.models.response import URLProbeResponse
from src.util.url_mapper import URLMapper

Expand All @@ -20,7 +21,7 @@
self.source_url_mappings = [tdo.url_mapping for tdo in self.tdos]
self._mapper = URLMapper(self.source_url_mappings)

self._response_pairs = extract_response_pairs(self.tdos)
self._response_pairs: list[URLProbeRedirectResponsePair] = extract_response_pairs(self.tdos)

self._destination_probe_responses: list[URLProbeResponse] = [
pair.destination
Expand Down Expand Up @@ -49,27 +50,34 @@
session=session
)


# Get all destination URLs already in the database

Check failure on line 54 in src/core/tasks/url/operators/probe/queries/insert_redirects/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/probe/queries/insert_redirects/query.py#L54 <303>

too many blank lines (2)
Raw output
./src/core/tasks/url/operators/probe/queries/insert_redirects/query.py:54:9: E303 too many blank lines (2)
dest_url_mappings_in_db: list[URLMapping] = await rm.get_url_mappings_in_db(
urls=self._destination_urls
)

# Filter out to only have those URLs that are new in the database
new_dest_urls: list[str] = filter_new_dest_urls(
url_mappings_in_db=dest_url_mappings_in_db,
all_dest_urls=self._destination_urls
)

# Add the new URLs
new_dest_url_mappings: list[URLMapping] = await rm.insert_new_urls(
urls=new_dest_urls
)
all_dest_url_mappings: list[URLMapping] = dest_url_mappings_in_db + new_dest_url_mappings

self._mapper.add_mappings(all_dest_url_mappings)

# Add web metadata for new URLs
await rm.add_web_metadata(
all_dest_url_mappings=all_dest_url_mappings,
dest_url_to_probe_response_mappings=self._destination_url_to_probe_response_mapping,
tdos=self.tdos
)

# Add redirect links for new URLs
await rm.add_redirect_links(
response_pairs=self._response_pairs,
mapper=self._mapper
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import Sequence

Check warning on line 1 in src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py:1:1: D100 Missing docstring in public module

from sqlalchemy import select, tuple_, RowMapping
from sqlalchemy.ext.asyncio import AsyncSession

from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_mappings, \
Expand All @@ -11,6 +14,8 @@
from src.db.dtos.url.mapping import URLMapping
from src.db.helpers.session import session_helper as sh
from src.db.models.impl.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic
from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL
from src.db.models.impl.url.core.sqlalchemy import URL

Check warning on line 18 in src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py#L18 <401>

'src.db.models.impl.url.core.sqlalchemy.URL' imported but unused
Raw output
./src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py:18:1: F401 'src.db.models.impl.url.core.sqlalchemy.URL' imported but unused
from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic
from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair
from src.external.url_request.probe.models.response import URLProbeResponse
Expand Down Expand Up @@ -69,10 +74,40 @@
response_pairs: list[URLProbeRedirectResponsePair],
mapper: URLMapper
) -> None:
links: list[LinkURLRedirectURLPydantic] = []
# Get all existing links and exclude
link_tuples: list[tuple[int, int]] = []
for pair in response_pairs:
source_url_id = mapper.get_id(pair.source.url)
destination_url_id = mapper.get_id(pair.destination.url)
link_tuples.append((source_url_id, destination_url_id))

query = (
select(
LinkURLRedirectURL.source_url_id,
LinkURLRedirectURL.destination_url_id
)
.where(
tuple_(
LinkURLRedirectURL.source_url_id,
LinkURLRedirectURL.destination_url_id
).in_(link_tuples)
)
)
mappings: Sequence[RowMapping] = await sh.mappings(self.session, query=query)
existing_links: set[tuple[int, int]] = {
(mapping["source_url_id"], mapping["destination_url_id"])
for mapping in mappings
}
new_links: list[tuple[int, int]] = [
(source_url_id, destination_url_id)
for source_url_id, destination_url_id in link_tuples
if (source_url_id, destination_url_id) not in existing_links
]


links: list[LinkURLRedirectURLPydantic] = []

Check failure on line 108 in src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py#L108 <303>

too many blank lines (2)
Raw output
./src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py:108:9: E303 too many blank lines (2)
for link in new_links:
source_url_id, destination_url_id = link
link = LinkURLRedirectURLPydantic(
source_url_id=source_url_id,
destination_url_id=destination_url_id
Expand Down
Loading