From a2d6f97444207b8f9e3b14c358dfe1fa84d78d7a Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Thu, 15 May 2025 16:44:05 -0400
Subject: [PATCH] feat(app): Add 404 Probe Task

Additionally, update the HTML Task to set the 404 status on URLs where a 404
is returned
---
 ...cb_create_url_probed_for_404_table_and_.py | 137 +++++++++++++++
 collector_db/AsyncDatabaseClient.py           |  87 +++++++++-
 collector_db/DTOConverter.py                  |  19 +++
 collector_db/enums.py                         |   1 +
 collector_db/models.py                        |  16 ++
 collector_manager/enums.py                    |   1 +
 core/DTOs/task_data_objects/URL404ProbeTDO.py |   9 +
 core/DTOs/task_data_objects/UrlHtmlTDO.py     |   6 +-
 core/TaskManager.py                           |   9 +
 .../task_operators/URL404ProbeTaskOperator.py |  63 +++++++
 .../task_operators/URLHTMLTaskOperator.py     |  31 +++-
 html_tag_collector/URLRequestInterface.py     |  45 +++--
 .../test_html_tag_collector_integration.py    |  20 ++-
 .../integration/tasks/test_url_404_probe.py   | 161 ++++++++++++++++++
 .../integration/tasks/test_url_html_task.py   |  34 +++-
 util/alembic_helpers.py                       |   8 +-
 16 files changed, 614 insertions(+), 33 deletions(-)
 create mode 100644 alembic/versions/2025_05_13_1234-b5f079b6b8cb_create_url_probed_for_404_table_and_.py
 create mode 100644 core/DTOs/task_data_objects/URL404ProbeTDO.py
 create mode 100644 core/classes/task_operators/URL404ProbeTaskOperator.py
 create mode 100644 tests/test_automated/integration/tasks/test_url_404_probe.py

diff --git a/alembic/versions/2025_05_13_1234-b5f079b6b8cb_create_url_probed_for_404_table_and_.py b/alembic/versions/2025_05_13_1234-b5f079b6b8cb_create_url_probed_for_404_table_and_.py
new file mode 100644
index 00000000..f8868b02
--- /dev/null
+++ b/alembic/versions/2025_05_13_1234-b5f079b6b8cb_create_url_probed_for_404_table_and_.py
@@ -0,0 +1,137 @@
+"""Create url_probed_for_404 table and adjust logic for 404 probe
+
+Revision ID: b5f079b6b8cb
+Revises: 864107b703ae
+Create Date: 2025-05-13 12:34:46.846656
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from util.alembic_helpers import switch_enum_type
+
+# revision identifiers, used by Alembic.
+revision: str = 'b5f079b6b8cb'
+down_revision: Union[str, None] = '864107b703ae'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        'url_probed_for_404',
+        sa.Column('id', sa.Integer(), nullable=False, primary_key=True),
+        sa.Column('url_id', sa.Integer(), sa.ForeignKey('urls.id'), nullable=False),
+        sa.Column('last_probed_at', sa.DateTime(), nullable=False, server_default=sa.text('now()')),
+    )
+
+    # Add unique constraint to url_id column
+    op.create_unique_constraint('uq_url_probed_for_404_url_id', 'url_probed_for_404', ['url_id'])
+    # Add unique constraint for url_id column in url_checked_for_duplicate table
+    op.create_unique_constraint('uq_url_checked_for_duplicates_url_id', 'url_checked_for_duplicate', ['url_id'])
+
+    # Create new `404 Not Found` URL Status
+    switch_enum_type(
+        table_name='urls',
+        column_name='outcome',
+        enum_name='url_status',
+        new_enum_values=[
+            'pending',
+            'submitted',
+            'validated',
+            'duplicate',
+            'rejected',
+            'error',
+            '404 not found',
+        ],
+        check_constraints_to_drop=['url_name_not_null_when_validated']
+    )
+
+    # Add '404 Probe' to TaskType Enum
+    switch_enum_type(
+        table_name='tasks',
+        column_name='task_type',
+        enum_name='task_type',
+        new_enum_values=[
+            'HTML',
+            'Relevancy',
+            'Record Type',
+            'Agency Identification',
+            'Misc Metadata',
+            'Submit Approved URLs',
+            'Duplicate Detection',
+            '404 Probe'
+        ]
+    )
+
+    op.execute(
+        """
+        ALTER TABLE urls
+        ADD CONSTRAINT url_name_not_null_when_validated
+        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
+        """
+    )
+
+    # Update existing error URLs whose error message mentions a 404 to the new status
+    op.execute("""
+    UPDATE urls
+    SET outcome = '404 not found'
+    FROM url_error_info uei
+    WHERE urls.id = uei.url_id
+    AND urls.outcome = 'error'
+    AND uei.error LIKE '%404%';
+    """)
+
+
+def downgrade() -> None:
+    op.drop_table('url_probed_for_404')
+
+    # Drop unique constraint for url_id column in url_checked_for_duplicate table
+    op.drop_constraint('uq_url_checked_for_duplicates_url_id', 'url_checked_for_duplicate', type_='unique')
+
+    # Drop `404 Not Found` URL Status
+    op.execute("""
+    UPDATE urls
+    SET outcome = 'error'
+    WHERE outcome = '404 not found';
+    """)
+
+    switch_enum_type(
+        table_name='urls',
+        column_name='outcome',
+        enum_name='url_status',
+        new_enum_values=[
+            'pending',
+            'submitted',
+            'validated',
+            'duplicate',
+            'rejected',
+            'error',
+        ],
+        check_constraints_to_drop=['url_name_not_null_when_validated']
+    )
+
+    op.execute(
+        """
+        ALTER TABLE urls
+        ADD CONSTRAINT url_name_not_null_when_validated
+        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
+        """
+    )
+
+    switch_enum_type(
+        table_name='tasks',
+        column_name='task_type',
+        enum_name='task_type',
+        new_enum_values=[
+            'HTML',
+            'Relevancy',
+            'Record Type',
+            'Agency Identification',
+            'Misc Metadata',
+            'Submit Approved URLs',
+            'Duplicate Detection',
+        ]
+    )
diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py
index ac6216d6..2fad609d 100644
--- a/collector_db/AsyncDatabaseClient.py
+++ b/collector_db/AsyncDatabaseClient.py
@@ -1,5 +1,6 @@
 from datetime import datetime, timedelta
 from functools import wraps
+from sqlalchemy import or_
 from typing import Optional, Type, Any, List
 
 from fastapi import HTTPException
@@ -29,7 +30,7 @@
     RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \
     UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \
     UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log, \
-    BacklogSnapshot, URLDataSource, URLCheckedForDuplicate
+    BacklogSnapshot, URLDataSource, URLCheckedForDuplicate, URLProbedFor404
 from collector_manager.enums import URLStatus, CollectorType
 from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
 from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
@@ -60,6 +61,7 @@
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
 from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
+from core.DTOs.task_data_objects.URL404ProbeTDO import URL404ProbeTDO
 from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
 from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo
 from core.EnvVarManager import EnvVarManager
@@ -468,12 +470,14 @@ async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URL
 
     @session_manager
-    async def get_pending_urls_without_html_data(self, session: AsyncSession):
+    async def get_pending_urls_without_html_data(self, session: AsyncSession) -> list[URLInfo]:
         # TODO: Add test that includes some urls WITH html data. Check they're not returned
         statement = self.statement_composer.pending_urls_without_html_data()
         statement = statement.limit(100).order_by(URL.id)
         scalar_result = await session.scalars(statement)
-        return scalar_result.all()
+        results: list[URL] = scalar_result.all()
+        return DTOConverter.url_list_to_url_info_list(results)
+
 
     async def get_urls_with_html_data_and_without_models(
             self,
@@ -1343,6 +1347,13 @@ async def get_url_info_by_url(self, session: AsyncSession, url: str) -> Optional
         url = raw_result.scalars().first()
         return URLInfo(**url.__dict__)
 
+    @session_manager
+    async def get_url_info_by_id(self, session: AsyncSession, url_id: int) -> Optional[URLInfo]:
+        query = Select(URL).where(URL.id == url_id)
+        raw_result = await session.execute(query)
+        url = raw_result.scalars().first()
+        return URLInfo(**url.__dict__)
+
     @session_manager
     async def insert_logs(self, session, log_infos: List[LogInfo]):
         for log_info in log_infos:
@@ -2267,8 +2278,81 @@ async def mark_all_as_duplicates(self, session: AsyncSession, url_ids: List[int]
         query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.DUPLICATE.value)
         await session.execute(query)
 
+    @session_manager
+    async def mark_all_as_404(self, session: AsyncSession, url_ids: List[int]):
+        query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.NOT_FOUND.value)
+        await session.execute(query)
+
+    @session_manager
+    async def mark_all_as_recently_probed_for_404(
+        self,
+        session: AsyncSession,
+        url_ids: List[int],
+        dt: datetime = func.now()
+    ):
+        from sqlalchemy.dialects.postgresql import insert as pg_insert
+        # Upsert: insert a probe row per URL, or refresh last_probed_at if one exists
+        values = [
+            {"url_id": url_id, "last_probed_at": dt} for url_id in url_ids
+        ]
+        stmt = pg_insert(URLProbedFor404).values(values)
+        update_stmt = stmt.on_conflict_do_update(
+            index_elements=['url_id'],
+            set_={"last_probed_at": dt}
+        )
+        await session.execute(update_stmt)
+
+
     @session_manager
     async def mark_as_checked_for_duplicates(self, session: AsyncSession, url_ids: list[int]):
         for url_id in url_ids:
             url_checked_for_duplicate = URLCheckedForDuplicate(url_id=url_id)
             session.add(url_checked_for_duplicate)
+
+    @session_manager
+    async def has_pending_urls_not_recently_probed_for_404(self, session: AsyncSession) -> bool:
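+        # A pending URL qualifies when it has never been probed (no
+        # URLProbedFor404 row) or when its last probe is more than 30 days old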
+        month_ago = func.now() - timedelta(days=30)
+        query = (
+            select(
+                URL.id
+            ).outerjoin(
+                URLProbedFor404
+            ).where(
+                and_(
+                    URL.outcome == URLStatus.PENDING.value,
+                    or_(
+                        URLProbedFor404.id.is_(None),
+                        URLProbedFor404.last_probed_at < month_ago
+                    )
+                )
+            ).limit(1)
+        )
+
+        raw_result = await session.execute(query)
+        result = raw_result.one_or_none()
+        return result is not None
+
+    @session_manager
+    async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSession) -> List[URL404ProbeTDO]:
+        month_ago = func.now() - timedelta(days=30)
+        query = (
+            select(
+                URL
+            ).outerjoin(
+                URLProbedFor404
+            ).where(
+                and_(
+                    URL.outcome == URLStatus.PENDING.value,
+                    or_(
+                        URLProbedFor404.id.is_(None),
+                        URLProbedFor404.last_probed_at < month_ago
+                    )
+                )
+            ).limit(100)
+        )
+
+        raw_result = await session.execute(query)
+        urls = raw_result.scalars().all()
+        return [URL404ProbeTDO(url=url.url, url_id=url.id) for url in urls]
\ No newline at end of file
diff --git a/collector_db/DTOConverter.py b/collector_db/DTOConverter.py
index b43fbbe9..307e8f85 100644
--- a/collector_db/DTOConverter.py
+++ b/collector_db/DTOConverter.py
@@ -1,6 +1,7 @@
 from typing import Optional
 
 from collector_db.DTOs.URLHTMLContentInfo import HTMLContentType, URLHTMLContentInfo
+from collector_db.DTOs.URLInfo import URLInfo
 from collector_db.DTOs.URLWithHTML import URLWithHTML
 from collector_db.models import AutomatedUrlAgencySuggestion, UserUrlAgencySuggestion, URLHTMLContent, URL, Agency, \
     AutoRecordTypeSuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion, AutoRelevantSuggestion, \
@@ -158,6 +159,24 @@ def final_review_annotation_agency_info(
     def url_list_to_url_with_html_list(url_list: list[URL]) -> list[URLWithHTML]:
         return [DTOConverter.url_to_url_with_html(url) for url in url_list]
 
+    @staticmethod
+    def url_list_to_url_info_list(urls: list[URL]) -> list[URLInfo]:
+        results = []
+        for url in urls:
+            url_info = URLInfo(
+                id=url.id,
+                batch_id=url.batch_id,
+                url=url.url,
+                collector_metadata=url.collector_metadata,
+                outcome=url.outcome,
+                created_at=url.created_at,
+                updated_at=url.updated_at,
+                name=url.name
+            )
+            results.append(url_info)
+
+        return results
+
     @staticmethod
     def url_to_url_with_html(url: URL) -> URLWithHTML:
         url_val = url.url
diff --git a/collector_db/enums.py b/collector_db/enums.py
index d6b3ec0f..2ea65aef 100644
--- a/collector_db/enums.py
+++ b/collector_db/enums.py
@@ -40,6 +40,7 @@ class TaskType(PyEnum):
     SUBMIT_APPROVED = "Submit Approved URLs"
     DUPLICATE_DETECTION = "Duplicate Detection"
     IDLE = "Idle"
+    PROBE_404 = "404 Probe"
 
 class PGEnum(TypeDecorator):
     impl = postgresql.ENUM
diff --git a/collector_db/models.py b/collector_db/models.py
index b2a86e9c..de4dedae 100644
--- a/collector_db/models.py
+++ b/collector_db/models.py
@@ -99,6 +99,7 @@ class URL(Base):
             'rejected',
             'duplicate',
             'error',
+            '404 not found',
             name='url_status'
         ),
         nullable=False
@@ -146,6 +147,11 @@ class URL(Base):
         uselist=False,
         back_populates="url"
     )
+    probed_for_404 = relationship(
+        "URLProbedFor404",
+        uselist=False,
+        back_populates="url"
+    )
 
 class URLCheckedForDuplicate(Base):
     __tablename__ = 'url_checked_for_duplicate'
@@ -157,6 +163,18 @@ class URLCheckedForDuplicate(Base):
 
     # Relationships
     url = relationship("URL", uselist=False, back_populates="checked_for_duplicate")
 
+class URLProbedFor404(Base):
+    __tablename__ = 'url_probed_for_404'
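+    # One row per probed URL (url_id is unique); last_probed_at is refreshed
+    # each time the URL is re-probed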
+
+    id = Column(Integer, primary_key=True)
+    url_id = Column(Integer, ForeignKey('urls.id'), nullable=False)
+    last_probed_at = get_created_at_column()
+
+    # Relationships
+    url = relationship("URL", uselist=False, back_populates="probed_for_404")
+
 
 class URLOptionalDataSourceMetadata(Base):
     __tablename__ = 'url_optional_data_source_metadata'
diff --git a/collector_manager/enums.py b/collector_manager/enums.py
index 5b89ffe2..3e852b24 100644
--- a/collector_manager/enums.py
+++ b/collector_manager/enums.py
@@ -17,3 +17,4 @@ class URLStatus(Enum):
     ERROR = "error"
     DUPLICATE = "duplicate"
     REJECTED = "rejected"
+    NOT_FOUND = "404 not found"
diff --git a/core/DTOs/task_data_objects/URL404ProbeTDO.py b/core/DTOs/task_data_objects/URL404ProbeTDO.py
new file mode 100644
index 00000000..f24cd7b3
--- /dev/null
+++ b/core/DTOs/task_data_objects/URL404ProbeTDO.py
@@ -0,0 +1,9 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class URL404ProbeTDO(BaseModel):
+    url_id: int
+    url: str
+    is_404: Optional[bool] = None
\ No newline at end of file
diff --git a/core/DTOs/task_data_objects/UrlHtmlTDO.py b/core/DTOs/task_data_objects/UrlHtmlTDO.py
index 05e9caf2..ca5b8a50 100644
--- a/core/DTOs/task_data_objects/UrlHtmlTDO.py
+++ b/core/DTOs/task_data_objects/UrlHtmlTDO.py
@@ -1,13 +1,13 @@
-from dataclasses import dataclass
 from typing import Optional
 
+from pydantic import BaseModel
+
 from collector_db.DTOs.URLInfo import URLInfo
 from html_tag_collector.DataClassTags import ResponseHTMLInfo
 from html_tag_collector.URLRequestInterface import URLResponseInfo
 
 
-@dataclass
-class UrlHtmlTDO:
+class UrlHtmlTDO(BaseModel):
     url_info: URLInfo
     url_response_info: Optional[URLResponseInfo] = None
     html_tag_info: Optional[ResponseHTMLInfo] = None
diff --git a/core/TaskManager.py b/core/TaskManager.py
index 1dcc9bb5..0c67e7a8 100644
--- a/core/TaskManager.py
+++ b/core/TaskManager.py
@@ -1,5 +1,6 @@
 import logging
 
+from core.classes.task_operators.URL404ProbeTaskOperator import URL404ProbeTaskOperator
 from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator
 from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
@@ -104,10 +105,18 @@ async def get_url_duplicate_task_operator(self):
         )
         return operator
 
+    async def get_url_404_probe_task_operator(self):
+        operator = URL404ProbeTaskOperator(
+            adb_client=self.adb_client,
+            url_request_interface=self.url_request_interface
+        )
+        return operator
+
     async def get_task_operators(self) -> list[TaskOperatorBase]:
         return [
             await self.get_url_html_task_operator(),
             await self.get_url_duplicate_task_operator(),
+            await self.get_url_404_probe_task_operator(),
             # await self.get_url_relevance_huggingface_task_operator(),
             await self.get_url_record_type_task_operator(),
             await self.get_agency_identification_task_operator(),
diff --git a/core/classes/task_operators/URL404ProbeTaskOperator.py b/core/classes/task_operators/URL404ProbeTaskOperator.py
new file mode 100644
index 00000000..bcfd7a1d
--- /dev/null
+++ b/core/classes/task_operators/URL404ProbeTaskOperator.py
@@ -0,0 +1,66 @@
+from http import HTTPStatus
+
+from pydantic import BaseModel
+
+from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
+from collector_db.enums import TaskType
+from core.DTOs.task_data_objects.URL404ProbeTDO import URL404ProbeTDO
+from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
+from html_tag_collector.URLRequestInterface import URLRequestInterface
+
+
+class URL404ProbeTDOSubsets(BaseModel):
+    successful: list[URL404ProbeTDO]
+    is_404: list[URL404ProbeTDO]
+
+
+
+class URL404ProbeTaskOperator(TaskOperatorBase):
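+    # Sends a plain GET to each pending URL (no browser rendering) and marks
+    # URLs that answer with HTTP 404; every probed URL gets a timestamp row
+    # so it is not re-probed for another 30 days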
+
+    def __init__(
+        self,
+        url_request_interface: URLRequestInterface,
+        adb_client: AsyncDatabaseClient,
+    ):
+        super().__init__(adb_client)
+        self.url_request_interface = url_request_interface
+
+    @property
+    def task_type(self):
+        return TaskType.PROBE_404
+
+    async def meets_task_prerequisites(self):
+        return await self.adb_client.has_pending_urls_not_recently_probed_for_404()
+
+    async def probe_urls_for_404(self, tdos: list[URL404ProbeTDO]):
+        responses = await self.url_request_interface.make_simple_requests(
+            urls=[tdo.url for tdo in tdos]
+        )
+        for tdo, response in zip(tdos, responses):
+            if response.status is None:
+                continue
+            tdo.is_404 = response.status == HTTPStatus.NOT_FOUND
+
+
+    async def inner_task_logic(self):
+        tdos = await self.get_pending_urls_not_recently_probed_for_404()
+        url_ids = [task_info.url_id for task_info in tdos]
+        await self.link_urls_to_task(url_ids=url_ids)
+        await self.probe_urls_for_404(tdos)
+        url_ids_404 = [tdo.url_id for tdo in tdos if tdo.is_404]
+
+        await self.update_404s_in_database(url_ids_404)
+        await self.mark_as_recently_probed_for_404(url_ids)
+
+    async def get_pending_urls_not_recently_probed_for_404(self) -> list[URL404ProbeTDO]:
+        return await self.adb_client.get_pending_urls_not_recently_probed_for_404()
+
+    async def update_404s_in_database(self, url_ids_404: list[int]):
+        await self.adb_client.mark_all_as_404(url_ids_404)
+
+    async def mark_as_recently_probed_for_404(self, url_ids: list[int]):
+        await self.adb_client.mark_all_as_recently_probed_for_404(url_ids)
+
diff --git a/core/classes/task_operators/URLHTMLTaskOperator.py b/core/classes/task_operators/URLHTMLTaskOperator.py
index f6cfa28a..17f383c7 100644
--- a/core/classes/task_operators/URLHTMLTaskOperator.py
+++ b/core/classes/task_operators/URLHTMLTaskOperator.py
@@ -1,3 +1,5 @@
+from http import HTTPStatus
+
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
 from collector_db.DTOs.URLInfo import URLInfo
@@ -34,7 +36,9 @@ async def inner_task_logic(self):
         await self.link_urls_to_task(url_ids=url_ids)
         await self.get_raw_html_data_for_urls(tdos)
         success_subset, error_subset = await self.separate_success_and_error_subsets(tdos)
-        await self.update_errors_in_database(error_subset)
+        non_404_error_subset, is_404_error_subset = await self.separate_error_and_404_subsets(error_subset)
+        await self.update_errors_in_database(non_404_error_subset)
+        await self.update_404s_in_database(is_404_error_subset)
         await self.process_html_data(success_subset)
         await self.update_html_data_in_database(success_subset)
 
@@ -53,7 +57,7 @@ async def get_pending_urls_without_html_data(self):
 
     async def get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]):
         just_urls = await self.get_just_urls(tdos)
-        url_response_infos = await self.url_request_interface.make_requests(just_urls)
+        url_response_infos = await self.url_request_interface.make_requests_with_html(just_urls)
         for tdto, url_response_info in zip(tdos, url_response_infos):
             tdto.url_response_info = url_response_info
 
@@ -73,6 +77,31 @@ async def separate_success_and_error_subsets(
                 successful_tdos.append(tdto)
         return successful_tdos, errored_tdos
 
+    async def separate_error_and_404_subsets(
+            self,
+            tdos: list[UrlHtmlTDO]
+    ) -> tuple[
+        list[UrlHtmlTDO],  # Error
+        list[UrlHtmlTDO]  # 404
+    ]:
+        tdos_error = []
+        tdos_404 = []
+        for tdo in tdos:
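+            # A missing status means the request failed without an HTTP
+            # response (e.g. a connection error), so it stays a generic error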
+            if tdo.url_response_info.status is None:
+                tdos_error.append(tdo)
+                continue
+            if tdo.url_response_info.status == HTTPStatus.NOT_FOUND:
+                tdos_404.append(tdo)
+            else:
+                tdos_error.append(tdo)
+        return tdos_error, tdos_404
+
+    async def update_404s_in_database(self, tdos_404: list[UrlHtmlTDO]):
+        url_ids = [tdo.url_info.id for tdo in tdos_404]
+        await self.adb_client.mark_all_as_404(url_ids)
+
     async def update_errors_in_database(self, error_tdos: list[UrlHtmlTDO]):
         error_infos = []
         for error_tdo in error_tdos:
diff --git a/html_tag_collector/URLRequestInterface.py b/html_tag_collector/URLRequestInterface.py
index 20ea1989..fb0eeb9f 100644
--- a/html_tag_collector/URLRequestInterface.py
+++ b/html_tag_collector/URLRequestInterface.py
@@ -1,21 +1,24 @@
 import asyncio
+from http import HTTPStatus
 from typing import Optional
 
-from aiohttp import ClientSession
+from aiohttp import ClientSession, ClientResponseError
 from playwright.async_api import async_playwright
 from dataclasses import dataclass
+from pydantic import BaseModel
 from tqdm.asyncio import tqdm
 
 MAX_CONCURRENCY = 5
 
-@dataclass
-class URLResponseInfo:
+class URLResponseInfo(BaseModel):
     success: bool
+    status: Optional[HTTPStatus] = None
     html: Optional[str] = None
     content_type: Optional[str] = None
-    exception: Optional[Exception] = None
+    exception: Optional[str] = None
+
 
 @dataclass
 class RequestResources:
@@ -23,10 +26,6 @@ class RequestResources:
     browser: async_playwright
     semaphore: asyncio.Semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
 
-def ensure_browsers_installed():
-    # TODO: Slated for destruction
-    pass
-
 HTML_CONTENT_TYPE = "text/html"
 
 class URLRequestInterface:
@@ -39,13 +38,16 @@ async def get_response(self, session: ClientSession, url: str) -> URLResponseInf
             return URLResponseInfo(
                 success=True,
                 html=text,
-                content_type=response.headers.get("content-type")
+                content_type=response.headers.get("content-type"),
+                status=HTTPStatus(response.status)
             )
+        except ClientResponseError as e:
+            return URLResponseInfo(success=False, status=HTTPStatus(e.status), exception=str(e))
         except Exception as e:
             print(f"An error occurred while fetching {url}: {e}")
-            return URLResponseInfo(success=False, exception=e)
+            return URLResponseInfo(success=False, exception=str(e))
 
-    async def fetch_and_render(self, rr: RequestResources, url: str) -> URLResponseInfo:
+    async def fetch_and_render(self, rr: RequestResources, url: str) -> Optional[URLResponseInfo]:
         simple_response = await self.get_response(rr.session, url)
         if not simple_response.success:
             return simple_response
@@ -53,6 +55,9 @@
         if simple_response.content_type != HTML_CONTENT_TYPE:
             return simple_response
 
+        return await self.get_dynamic_html_content(rr, url)
+
+    async def get_dynamic_html_content(self, rr: RequestResources, url: str) -> URLResponseInfo:
         # For HTML responses, attempt to load the page to check for dynamic html content
         async with rr.semaphore:
             page = await rr.browser.new_page()
             try:
                 await page.goto(url)
                 await page.wait_for_load_state("networkidle")
                 html_content = await page.content()
-                return URLResponseInfo(success=True, html=html_content, content_type=HTML_CONTENT_TYPE)
+                return URLResponseInfo(
+                    success=True,
+                    html=html_content,
+                    content_type=HTML_CONTENT_TYPE,
+                    status=HTTPStatus.OK
+                )
             except Exception as e:
-                return URLResponseInfo(success=False, exception=e)
+                return URLResponseInfo(success=False, exception=str(e))
             finally:
                 await page.close()
 
@@ -75,12 +85,19 @@ async def fetch_urls(self, urls: list[str]) -> list[URLResponseInfo]:
         results = await tqdm.gather(*tasks)
         return results
 
-    async def make_requests(
+    async def make_requests_with_html(
             self,
             urls: list[str],
     ) -> list[URLResponseInfo]:
-        ensure_browsers_installed()
         return await self.fetch_urls(urls)
+
+    async def make_simple_requests(self, urls: list[str]) -> list[URLResponseInfo]:
+        # Plain aiohttp requests with no Playwright rendering; the 404 probe
+        # only needs the response status, not rendered page content
+        async with ClientSession() as session:
+            tasks = [self.get_response(session, url) for url in urls]
+            results = await tqdm.gather(*tasks)
+            return results
+
diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py
index 3ffef203..228f4b68 100644
--- a/tests/manual/html_collector/test_html_tag_collector_integration.py
+++ b/tests/manual/html_collector/test_html_tag_collector_integration.py
@@ -3,7 +3,7 @@
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.DTOs.URLInfo import URLInfo
 from core.classes.task_operators.URLHTMLTaskOperator import URLHTMLTaskOperator
-from helpers.DBDataCreator import DBDataCreator
+from tests.helpers.DBDataCreator import DBDataCreator
 from html_tag_collector.ResponseParser import HTMLResponseParser
 from html_tag_collector.RootURLCache import RootURLCache
 from html_tag_collector.URLRequestInterface import URLRequestInterface
@@ -13,7 +13,8 @@
     "https://pdapio.io",
     "https://pdap.dev",
     "https://pdap.io/404test",
-    "https://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html"
+    "https://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html",
+
 ]
 
 sample_json_data = [
@@ -27,13 +28,24 @@
 @pytest.mark.asyncio
 async def test_get_response():
     uri = URLRequestInterface()
-    results = await uri.make_requests(URLS)
+    results = await uri.make_requests_with_html(URLS)
     print(results)
 
 @pytest.mark.asyncio
 async def test_get_response_with_javascript():
     uri = URLRequestInterface()
-    results = await uri.make_requests(URLS)
+    results = await uri.make_requests_with_html(URLS)
+    print(results)
+
+
+@pytest.mark.asyncio
+async def test_get_response_with_javascript_404():
+    uri = URLRequestInterface()
+    results = await uri.make_requests_with_html(
+        [
+            'https://data.tempe.gov/apps/tempegov::1-05-feeling-of-safety-in-your-neighborhood-dashboard'
+        ]
+    )
     print(results)
 
 @pytest.mark.asyncio
diff --git a/tests/test_automated/integration/tasks/test_url_404_probe.py b/tests/test_automated/integration/tasks/test_url_404_probe.py
new file mode 100644
index 00000000..e248363a
--- /dev/null
+++ b/tests/test_automated/integration/tasks/test_url_404_probe.py
@@ -0,0 +1,161 @@
+import types
+from http import HTTPStatus
+
+import pendulum
+import pytest
+from aiohttp import ClientResponseError, RequestInfo
+
+from collector_db.models import URLProbedFor404, URL
+from collector_manager.enums import URLStatus
+from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
+from core.classes.task_operators.URL404ProbeTaskOperator import URL404ProbeTaskOperator
+from html_tag_collector.URLRequestInterface import URLResponseInfo, URLRequestInterface
+from tests.helpers.DBDataCreator import DBDataCreator
+from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters
+
+
+@pytest.mark.asyncio
+async def test_url_404_probe_task(db_data_creator: DBDataCreator):
+
+    mock_html_content = ""
+    mock_content_type = "text/html"
+    adb_client = db_data_creator.adb_client
+
+    async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseInfo]:
+        """
+        Mock make_simple_requests so that
+        - the first url returns a 200
+        - the second url returns a 404
+        - the third url returns a general error
+
+        """
+        results = []
+        for idx, url in enumerate(urls):
+            if idx == 1:
+                results.append(
+                    URLResponseInfo(
+                        success=False,
+                        content_type=mock_content_type,
+                        exception=str(ClientResponseError(
+                            request_info=RequestInfo(
+                                url=url,
+                                method="GET",
+                                real_url=url,
+                                headers={},
+                            ),
+                            status=HTTPStatus.NOT_FOUND.value,
+                            history=(None,),
+                        )),
+                        status=HTTPStatus.NOT_FOUND
+                    )
+                )
+            elif idx == 2:
+                results.append(
+                    URLResponseInfo(
+                        success=False,
+                        exception=str(ValueError("test error")),
+                        content_type=mock_content_type
+                    )
+                )
+            else:
+                results.append(URLResponseInfo(
+                    html=mock_html_content, success=True, content_type=mock_content_type))
+        return results
+
+    url_request_interface = URLRequestInterface()
+    url_request_interface.make_simple_requests = types.MethodType(mock_make_simple_requests, url_request_interface)
+
+    operator = URL404ProbeTaskOperator(
+        url_request_interface=url_request_interface,
+        adb_client=adb_client
+    )
+    # Check that initially prerequisites aren't met
+    meets_prereqs = await operator.meets_task_prerequisites()
+    assert not meets_prereqs
+
+    # Add 4 URLs, 3 pending, 1 error
+    creation_info = await db_data_creator.batch_v2(
+        parameters=TestBatchCreationParameters(
+            urls=[
+                TestURLCreationParameters(
+                    count=3,
+                    status=URLStatus.PENDING,
+                    with_html_content=True
+                ),
+                TestURLCreationParameters(
+                    count=1,
+                    status=URLStatus.ERROR,
+                    with_html_content=False
+                ),
+            ]
+        )
+    )
+
+    meets_prereqs = await operator.meets_task_prerequisites()
+    assert meets_prereqs
+
+    # Run task and validate results
+    run_info = await operator.run_task(task_id=1)
+    assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message
+
+
+    pending_url_mappings = creation_info.url_creation_infos[URLStatus.PENDING].url_mappings
+    url_id_success = pending_url_mappings[0].url_id
+    url_id_404 = pending_url_mappings[1].url_id
+    url_id_error = pending_url_mappings[2].url_id
+
+    url_id_initial_error = creation_info.url_creation_infos[URLStatus.ERROR].url_mappings[0].url_id
+
+    # Check that URLProbedFor404 has been appropriately populated
+    probed_for_404_objects: list[URLProbedFor404] = await db_data_creator.adb_client.get_all(URLProbedFor404)
+
+    assert len(probed_for_404_objects) == 3
+    assert probed_for_404_objects[0].url_id == url_id_success
+    assert probed_for_404_objects[1].url_id == url_id_404
+    assert probed_for_404_objects[2].url_id == url_id_error
+
+    # Check that the URLs have been updated appropriately
+    urls: list[URL] = await adb_client.get_all(URL)
+
+    def find_url(url_id: int) -> URL:
+        for url in urls:
+            if url.id == url_id:
+                return url
+        raise Exception(f"URL with id {url_id} not found")
+
+    assert find_url(url_id_success).outcome == URLStatus.PENDING.value
+    assert find_url(url_id_404).outcome == URLStatus.NOT_FOUND.value
+    assert find_url(url_id_error).outcome == URLStatus.PENDING.value
+    assert find_url(url_id_initial_error).outcome == URLStatus.ERROR.value
+
+    # Check that meets_task_prerequisites now returns False
+    meets_prereqs = await operator.meets_task_prerequisites()
+    assert not meets_prereqs
+
+    # Check that meets_task_prerequisites returns True
+    # after setting the last probed for 404 date to 2 months ago
+    two_months_ago = pendulum.now().subtract(months=2).naive()
+    await adb_client.mark_all_as_recently_probed_for_404(
+        [url_id_404, url_id_error],
+        dt=two_months_ago
+    )
+
+    meets_prereqs = await operator.meets_task_prerequisites()
+    assert meets_prereqs
+
+    # Run the task and ensure all but the URL previously marked as 404 have been checked again
+    run_info = await operator.run_task(task_id=2)
+    assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message
+
+    probed_for_404_objects: list[URLProbedFor404] = await db_data_creator.adb_client.get_all(URLProbedFor404)
+
+    assert len(probed_for_404_objects) == 3
+    assert probed_for_404_objects[0].last_probed_at != two_months_ago
+    assert probed_for_404_objects[1].last_probed_at == two_months_ago
+    assert probed_for_404_objects[2].last_probed_at != two_months_ago
+
+
+
+
+
diff --git a/tests/test_automated/integration/tasks/test_url_html_task.py b/tests/test_automated/integration/tasks/test_url_html_task.py
index 4c33016b..aeb0db7f 100644
--- a/tests/test_automated/integration/tasks/test_url_html_task.py
+++ b/tests/test_automated/integration/tasks/test_url_html_task.py
@@ -1,10 +1,13 @@
 import types
+from http import HTTPStatus
 from typing import Optional
 
 import pytest
+from aiohttp import ClientResponseError, RequestInfo
 
 from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
 from collector_db.enums import TaskType
+from collector_manager.enums import URLStatus
 from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
 from core.classes.task_operators.URLHTMLTaskOperator import URLHTMLTaskOperator
 from tests.helpers.DBDataCreator import DBDataCreator
@@ -23,11 +26,31 @@ async def test_url_html_task(db_data_creator: DBDataCreator):
     async def mock_make_requests(self, urls: list[str]) -> list[URLResponseInfo]:
         results = []
         for idx, url in enumerate(urls):
+            if idx == 1:
+                results.append(
+                    URLResponseInfo(
+                        success=False,
+                        content_type=mock_content_type,
+                        exception=str(ClientResponseError(
+                            request_info=RequestInfo(
+                                url=url,
+                                method="GET",
+                                real_url=url,
+                                headers={},
+                            ),
+                            status=HTTPStatus.NOT_FOUND.value,
+                            history=(None,),
+                        )),
+                        status=HTTPStatus.NOT_FOUND
+                    )
+                )
+                continue
+
             if idx == 2:
                 results.append(
                     URLResponseInfo(
                         success=False,
-                        exception=ValueError("test error"),
+                        exception=str(ValueError("test error")),
                         content_type=mock_content_type
                     ))
             else:
@@ -49,7 +72,7 @@ async def mock_get_from_cache(self, url: str) -> Optional[str]:
 
     # Add mock methods or mock classes
     url_request_interface = URLRequestInterface()
-    url_request_interface.make_requests = types.MethodType(mock_make_requests, url_request_interface)
+    url_request_interface.make_requests_with_html = types.MethodType(mock_make_requests, url_request_interface)
     mock_root_url_cache = RootURLCache()
     mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache)
 
@@ -94,11 +117,12 @@ async def mock_get_from_cache(self, url: str) -> Optional[str]:
     assert task_info.url_errors[0].error == "test error"
 
     adb = db_data_creator.adb_client
-    # Check that both success urls have two rows of HTML data
+    # Check that the success url has two rows of HTML data
     await adb.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids)
     hci = await adb.get_html_content_info(url_id=url_ids[0])
     assert len(hci) == 2
-    hci = await adb.get_html_content_info(url_id=url_ids[1])
-    assert len(hci) == 2
+    # Check that the 404 url has a status of 404
+    url_info_404 = await adb.get_url_info_by_id(url_id=url_ids[1])
+    assert url_info_404.outcome == URLStatus.NOT_FOUND
 
     # Check that errored url has error info
diff --git a/util/alembic_helpers.py b/util/alembic_helpers.py
index 84cdbfa7..822b4cd9 100644
--- a/util/alembic_helpers.py
+++ b/util/alembic_helpers.py
@@ -7,7 +7,7 @@ def switch_enum_type(
         enum_name,
         new_enum_values,
         drop_old_enum=True,
-        cast_dict: dict = None
+        check_constraints_to_drop: list[str] = None,
 ):
     """
     Switches an ENUM type in a PostgreSQL column by:
@@ -23,6 +23,14 @@
     :param drop_old_enum: Whether to drop the old ENUM type.
+    :param check_constraints_to_drop: Names of check constraints to drop before
+        the switch (re-create them afterward if still needed).
     """
 
+    # Drop check constraints that reference the enum
+    if check_constraints_to_drop is not None:
+        for constraint in check_constraints_to_drop:
+            op.execute(f'ALTER TABLE "{table_name}" DROP CONSTRAINT IF EXISTS "{constraint}"')
+
+
     # Rename old enum type
     old_enum_temp_name = f"{enum_name}_old"
     op.execute(f'ALTER TYPE "{enum_name}" RENAME TO "{old_enum_temp_name}"')