From f0f33c4cc2f5396f3e5d64931e2da77f4f795617 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 30 Jul 2025 17:12:50 -0400 Subject: [PATCH 01/13] Add scraping logic for non-pending URLs --- src/core/tasks/url/operators/url_html/core.py | 8 ++++---- .../queries/get_pending_urls_without_html_data.py | 2 +- src/db/client/async_.py | 6 +++--- src/db/statement_composer.py | 12 ++++++++++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/core/tasks/url/operators/url_html/core.py b/src/core/tasks/url/operators/url_html/core.py index 39a09546..81baf348 100644 --- a/src/core/tasks/url/operators/url_html/core.py +++ b/src/core/tasks/url/operators/url_html/core.py @@ -29,10 +29,10 @@ def task_type(self): return TaskType.HTML async def meets_task_prerequisites(self): - return await self.adb_client.has_pending_urls_without_html_data() + return await self.adb_client.has_non_errored_urls_without_html_data() async def inner_task_logic(self): - tdos = await self.get_pending_urls_without_html_data() + tdos = await self.get_non_errored_urls_without_html_data() url_ids = [task_info.url_info.id for task_info in tdos] await self.link_urls_to_task(url_ids=url_ids) await self.get_raw_html_data_for_urls(tdos) @@ -54,8 +54,8 @@ async def update_database( async def get_just_urls(self, tdos: list[UrlHtmlTDO]): return [task_info.url_info.url for task_info in tdos] - async def get_pending_urls_without_html_data(self): - pending_urls: list[URLInfo] = await self.adb_client.get_pending_urls_without_html_data() + async def get_non_errored_urls_without_html_data(self): + pending_urls: list[URLInfo] = await self.adb_client.get_non_errored_urls_without_html_data() tdos = [ UrlHtmlTDO( url_info=url_info, diff --git a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py index ff7f7c10..16ceb4f4 100644 --- 
a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py +++ b/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py @@ -9,7 +9,7 @@ class GetPendingURLsWithoutHTMLDataQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[URLInfo]: - statement = StatementComposer.pending_urls_without_html_data() + statement = StatementComposer.has_non_errored_urls_without_html_data() statement = statement.limit(100).order_by(URL.id) scalar_result = await session.scalars(statement) url_results: list[URL] = scalar_result.all() diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 9f554f87..3d048d35 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -478,8 +478,8 @@ async def add_html_content_infos(self, session: AsyncSession, html_content_infos await self._add_models(session, URLHTMLContent, html_content_infos) @session_manager - async def has_pending_urls_without_html_data(self, session: AsyncSession) -> bool: - statement = self.statement_composer.pending_urls_without_html_data() + async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool: + statement = self.statement_composer.has_non_errored_urls_without_html_data() statement = statement.limit(1) scalar_result = await session.scalars(statement) return bool(scalar_result.first()) @@ -520,7 +520,7 @@ async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URL ) session.add(metadata_object) - async def get_pending_urls_without_html_data(self) -> list[URLInfo]: + async def get_non_errored_urls_without_html_data(self) -> list[URLInfo]: return await self.run_query_builder(GetPendingURLsWithoutHTMLDataQueryBuilder()) async def get_urls_with_html_data_and_without_models( diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 518aafc2..a6f468ee 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -25,7 +25,7 @@ class 
StatementComposer: """ @staticmethod - def pending_urls_without_html_data() -> Select: + def has_non_errored_urls_without_html_data() -> Select: exclude_subquery = ( select(1). select_from(LinkTaskURL). @@ -39,7 +39,15 @@ def pending_urls_without_html_data() -> Select: outerjoin(URLHTMLContent). where(URLHTMLContent.id == None). where(~exists(exclude_subquery)). - where(URL.outcome == URLStatus.PENDING.value) + where(URL.outcome.in_( + [ + URLStatus.PENDING, + URLStatus.NOT_RELEVANT, + URLStatus.INDIVIDUAL_RECORD, + URLStatus.SUBMITTED, + URLStatus.VALIDATED + ] + )) .options( selectinload(URL.batch) ) From 15e8bee444a961a29bcf28dcee83671467208765 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 31 Jul 2025 15:21:29 -0400 Subject: [PATCH 02/13] Add scraping logic for non pending URLs --- api/main.py | 0 .../scraper/request_interface/core.py | 10 +++++-- start_mirrored_local_app.py | 29 +++++++++++++------ 3 files changed, 27 insertions(+), 12 deletions(-) delete mode 100644 api/main.py diff --git a/api/main.py b/api/main.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py b/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py index f45780cb..25e9a3af 100644 --- a/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py +++ b/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py @@ -13,6 +13,13 @@ class URLRequestInterface: async def get_response(self, session: ClientSession, url: str) -> URLResponseInfo: + try: + return await self._execute_get(session, url) + except Exception as e: + print(f"An error occurred while fetching {url}: {e}") + return URLResponseInfo(success=False, exception=str(e)) + + async def _execute_get(self, session, url): try: async with session.get(url, timeout=20) as response: response.raise_for_status() @@ -25,9 +32,6 @@ async def get_response(self, session: ClientSession, url: str) -> URLResponseInf ) 
except ClientResponseError as e: return URLResponseInfo(success=False, status=HTTPStatus(e.status), exception=str(e)) - except Exception as e: - print(f"An error occurred while fetching {url}: {e}") - return URLResponseInfo(success=False, exception=str(e)) async def fetch_and_render(self, rr: RequestResources, url: str) -> Optional[URLResponseInfo]: simple_response = await self.get_response(rr.session, url) diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py index 5199fba2..e2bd10e3 100644 --- a/start_mirrored_local_app.py +++ b/start_mirrored_local_app.py @@ -27,15 +27,8 @@ def main(): # Check cache if exists and checker = TimestampChecker() data_dump_container = docker_manager.run_container(data_dumper_docker_info) - if checker.last_run_within_24_hours(): - print("Last run within 24 hours, skipping dump...") - else: - data_dump_container.run_command( - DUMP_SH_DOCKER_PATH, - ) - data_dump_container.run_command( - RESTORE_SH_DOCKER_PATH, - ) + _run_dump_if_longer_than_24_hours(checker, data_dump_container) + _run_database_restore(data_dump_container) print("Stopping datadumper container") data_dump_container.stop() checker.set_last_run_time() @@ -44,6 +37,10 @@ def main(): apply_migrations() # Run `fastapi dev main.py` + _run_fast_api(docker_manager) + + +def _run_fast_api(docker_manager: DockerManager) -> None: try: uvicorn.run( "src.api.main:app", @@ -59,8 +56,22 @@ def main(): print("Containers stopped.") +def _run_database_restore(data_dump_container) -> None: + data_dump_container.run_command( + RESTORE_SH_DOCKER_PATH, + ) +def _run_dump_if_longer_than_24_hours( + checker, + data_dump_container +): + if checker.last_run_within_24_hours(): + print("Last run within 24 hours, skipping dump...") + else: + data_dump_container.run_command( + DUMP_SH_DOCKER_PATH, + ) if __name__ == "__main__": From e92cd6699604c39e5a3c960366a3c960b989d131 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 1 Aug 2025 08:11:16 -0400 Subject: [PATCH 03/13] Clean up 
logic, refactor URL Requests Interface, begin setting up probe task --- ...-99eceed6e614_add_web_status_info_table.py | 99 +++++++++++++++++++ .../agency/get/queries/next_for_annotation.py | 2 +- .../annotate/dtos/shared/base/response.py | 2 +- src/api/endpoints/review/next/dto.py | 2 +- src/api/endpoints/review/next/query.py | 2 +- src/api/main.py | 6 +- src/core/tasks/url/loader.py | 14 +-- .../__init__.py | 0 .../{url_duplicate => duplicate}/core.py | 2 +- .../{url_duplicate => duplicate}/tdo.py | 0 .../queries => html}/__init__.py | 0 .../{url_html => html}/content_info_getter.py | 2 +- .../url/operators/{url_html => html}/core.py | 8 +- .../queries}/__init__.py | 0 .../get_pending_urls_without_html_data.py | 0 .../{url_html => html}/scraper/README.md | 0 .../scraper}/__init__.py | 0 .../scraper/parser/README.md | 0 .../scraper/parser}/__init__.py | 0 .../scraper/parser/constants.py | 0 .../{url_html => html}/scraper/parser/core.py | 10 +- .../scraper/parser/dtos}/__init__.py | 0 .../scraper/parser/dtos/response_html.py | 0 .../scraper/parser/enums.py | 0 .../scraper/parser/mapping.py | 0 .../{url_html => html}/scraper/parser/util.py | 4 +- .../scraper/root_url_cache}/__init__.py | 0 .../scraper/root_url_cache/constants.py | 0 .../scraper/root_url_cache/core.py | 4 +- .../scraper/root_url_cache/dtos}/__init__.py | 0 .../scraper/root_url_cache/dtos/response.py | 0 .../url/operators/{url_html => html}/tdo.py | 4 +- .../parser/dtos => misc_metadata}/__init__.py | 0 .../core.py | 2 +- .../queries}/__init__.py | 0 ...pending_urls_missing_miscellaneous_data.py | 2 +- ...pending_urls_missing_miscellaneous_data.py | 0 .../tdo.py | 0 .../dtos => probe}/__init__.py | 0 src/core/tasks/url/operators/probe/core.py | 62 ++++++++++++ .../queries}/__init__.py | 0 .../url/operators/probe/queries/get_urls.py | 31 ++++++ .../url/operators/probe/queries/has_urls.py | 27 +++++ src/core/tasks/url/operators/probe/tdo.py | 9 ++ .../dtos => probe_404}/__init__.py | 0 .../{url_404_probe 
=> probe_404}/core.py | 4 +- .../{url_404_probe => probe_404}/tdo.py | 0 .../__init__.py | 0 .../core.py | 2 +- .../queries/__init__.py | 0 .../queries/get.py | 2 +- .../queries/has_validated.py | 0 .../queries/mark_submitted.py | 2 +- .../tdo.py | 0 .../scraper/request_interface/core.py | 84 ---------------- .../miscellaneous_metadata/auto_googler.py | 2 +- .../subtasks/miscellaneous_metadata/base.py | 2 +- .../subtasks/miscellaneous_metadata/ckan.py | 2 +- .../miscellaneous_metadata/muckrock.py | 2 +- src/db/client/async_.py | 32 ++++-- src/db/client/sync.py | 2 +- src/db/dto_converter.py | 4 +- src/db/dtos/url/mapping.py | 1 + src/db/enums.py | 1 + src/db/helpers/session/session_helper.py | 4 + .../url/web_metadata/__init__.py | 0 .../url/web_metadata/sqlalchemy.py | 33 +++++++ src/external/pdap/client.py | 2 +- .../url_request}/README.md | 0 src/external/url_request/__init__.py | 0 .../url_request}/constants.py | 0 src/external/url_request/core.py | 21 ++++ src/external/url_request/dtos/__init__.py | 0 .../url_request}/dtos/request_resources.py | 2 +- .../url_request}/dtos/url_response.py | 0 src/external/url_request/probe/__init__.py | 0 src/external/url_request/probe/core.py | 43 ++++++++ src/external/url_request/probe/format.py | 32 ++++++ src/external/url_request/probe/model.py | 15 +++ src/external/url_request/request.py | 91 +++++++++++++++++ .../integration/api/test_annotate.py | 2 +- .../html_tag_collector/test_root_url_cache.py | 4 +- .../url/duplicate/test_url_duplicate_task.py | 2 +- .../tasks/url/html/mocks/methods.py | 4 +- .../integration/tasks/url/html/setup.py | 8 +- .../test_submit_approved_url_task.py | 2 +- .../tasks/url/test_url_404_probe.py | 6 +- .../test_url_miscellaneous_metadata_task.py | 2 +- tests/conftest.py | 2 +- .../data_creator/commands/impl/urls.py | 2 +- tests/helpers/data_creator/core.py | 4 +- .../core/tasks/test_url_html_task_operator.py | 10 +- tests/manual/external/url_request/__init__.py | 0 
.../external/url_request/test_url_probe.py | 22 +++++ .../test_html_tag_collector_integration.py | 8 +- 95 files changed, 587 insertions(+), 170 deletions(-) create mode 100644 alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py rename src/core/tasks/url/operators/{submit_approved_url => duplicate}/__init__.py (100%) rename src/core/tasks/url/operators/{url_duplicate => duplicate}/core.py (95%) rename src/core/tasks/url/operators/{url_duplicate => duplicate}/tdo.py (100%) rename src/core/tasks/url/operators/{submit_approved_url/queries => html}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/content_info_getter.py (90%) rename src/core/tasks/url/operators/{url_html => html}/core.py (94%) rename src/core/tasks/url/operators/{url_404_probe => html/queries}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/queries/get_pending_urls_without_html_data.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/README.md (100%) rename src/core/tasks/url/operators/{url_duplicate => html/scraper}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/README.md (100%) rename src/core/tasks/url/operators/{url_html => html/scraper/parser}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/constants.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/core.py (89%) rename src/core/tasks/url/operators/{url_html/queries => html/scraper/parser/dtos}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/dtos/response_html.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/enums.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/mapping.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/parser/util.py (84%) rename src/core/tasks/url/operators/{url_html/scraper => 
html/scraper/root_url_cache}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/root_url_cache/constants.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/root_url_cache/core.py (92%) rename src/core/tasks/url/operators/{url_html/scraper/parser => html/scraper/root_url_cache/dtos}/__init__.py (100%) rename src/core/tasks/url/operators/{url_html => html}/scraper/root_url_cache/dtos/response.py (100%) rename src/core/tasks/url/operators/{url_html => html}/tdo.py (57%) rename src/core/tasks/url/operators/{url_html/scraper/parser/dtos => misc_metadata}/__init__.py (100%) rename src/core/tasks/url/operators/{url_miscellaneous_metadata => misc_metadata}/core.py (96%) rename src/core/tasks/url/operators/{url_html/scraper/request_interface => misc_metadata/queries}/__init__.py (100%) rename src/core/tasks/url/operators/{url_miscellaneous_metadata => misc_metadata}/queries/get_pending_urls_missing_miscellaneous_data.py (93%) rename src/core/tasks/url/operators/{url_miscellaneous_metadata => misc_metadata}/queries/has_pending_urls_missing_miscellaneous_data.py (100%) rename src/core/tasks/url/operators/{url_miscellaneous_metadata => misc_metadata}/tdo.py (100%) rename src/core/tasks/url/operators/{url_html/scraper/request_interface/dtos => probe}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/probe/core.py rename src/core/tasks/url/operators/{url_html/scraper/root_url_cache => probe/queries}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/probe/queries/get_urls.py create mode 100644 src/core/tasks/url/operators/probe/queries/has_urls.py create mode 100644 src/core/tasks/url/operators/probe/tdo.py rename src/core/tasks/url/operators/{url_html/scraper/root_url_cache/dtos => probe_404}/__init__.py (100%) rename src/core/tasks/url/operators/{url_404_probe => probe_404}/core.py (92%) rename src/core/tasks/url/operators/{url_404_probe => probe_404}/tdo.py (100%) rename 
src/core/tasks/url/operators/{url_miscellaneous_metadata => submit_approved}/__init__.py (100%) rename src/core/tasks/url/operators/{submit_approved_url => submit_approved}/core.py (96%) rename src/core/tasks/url/operators/{url_miscellaneous_metadata => submit_approved}/queries/__init__.py (100%) rename src/core/tasks/url/operators/{submit_approved_url => submit_approved}/queries/get.py (96%) rename src/core/tasks/url/operators/{submit_approved_url => submit_approved}/queries/has_validated.py (100%) rename src/core/tasks/url/operators/{submit_approved_url => submit_approved}/queries/mark_submitted.py (93%) rename src/core/tasks/url/operators/{submit_approved_url => submit_approved}/tdo.py (100%) delete mode 100644 src/core/tasks/url/operators/url_html/scraper/request_interface/core.py create mode 100644 src/db/models/instantiations/url/web_metadata/__init__.py create mode 100644 src/db/models/instantiations/url/web_metadata/sqlalchemy.py rename src/{core/tasks/url/operators/url_html/scraper/request_interface => external/url_request}/README.md (100%) create mode 100644 src/external/url_request/__init__.py rename src/{core/tasks/url/operators/url_html/scraper/request_interface => external/url_request}/constants.py (100%) create mode 100644 src/external/url_request/core.py create mode 100644 src/external/url_request/dtos/__init__.py rename src/{core/tasks/url/operators/url_html/scraper/request_interface => external/url_request}/dtos/request_resources.py (74%) rename src/{core/tasks/url/operators/url_html/scraper/request_interface => external/url_request}/dtos/url_response.py (100%) create mode 100644 src/external/url_request/probe/__init__.py create mode 100644 src/external/url_request/probe/core.py create mode 100644 src/external/url_request/probe/format.py create mode 100644 src/external/url_request/probe/model.py create mode 100644 src/external/url_request/request.py create mode 100644 tests/manual/external/url_request/__init__.py create mode 100644 
tests/manual/external/url_request/test_url_probe.py diff --git a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py new file mode 100644 index 00000000..0b69cc90 --- /dev/null +++ b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py @@ -0,0 +1,99 @@ +"""Add HTML Status Info table + +Revision ID: 99eceed6e614 +Revises: 637de6eaa3ab +Create Date: 2025-07-31 15:36:40.966605 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import id_column, created_at_column, updated_at_column, url_id_column, switch_enum_type + +# revision identifiers, used by Alembic. +revision: str = '99eceed6e614' +down_revision: Union[str, None] = '637de6eaa3ab' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +WEB_STATUS_ENUM = sa.Enum( + "not_attempted", + "success", + "error", + "404_not_found", + name="web_status" +) + +TABLE_NAME = 'url_web_metadata' + +def _add_url_probe_task_type_enum() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources', + 'Push to Hugging Face', + 'URL Probe' + ] + ) + +def _drop_url_probe_task_type_enum() -> None: + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources', + 'Push to Hugging Face' + ] + ) + +def _create_url_html_info_table() -> None: + op.create_table( + TABLE_NAME, + id_column(), + 
url_id_column(), + sa.Column('accessed', sa.Boolean(), nullable=False), + sa.Column('status_code', sa.Integer(), nullable=False), + sa.Column('content_type', sa.Text(), nullable=True), + sa.Column('error_message', sa.Text(), nullable=True), + created_at_column(), + updated_at_column(), + sa.UniqueConstraint('url_id', name='uq_url_web_status_info_url_id'), + sa.CheckConstraint('status_code >= 100', name='ck_url_web_status_info_status_code_min'), + sa.CheckConstraint('status_code <= 999', name='ck_url_web_status_info_status_code_max'), + ) + +def _drop_url_html_info_table() -> None: + op.drop_table(TABLE_NAME) + + +def upgrade() -> None: + _create_url_html_info_table() + + +def downgrade() -> None: + _drop_url_html_info_table() + # Drop Enums + WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True) diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 27f7a382..66a5e3fb 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -7,7 +7,7 @@ from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus -from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info +from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py index a7e30385..1e9fc5fa 100644 --- a/src/api/endpoints/annotate/dtos/shared/base/response.py +++ 
b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.mapping import URLMapping diff --git a/src/api/endpoints/review/next/dto.py b/src/api/endpoints/review/next/dto.py index 7fc53b17..a9c378b9 100644 --- a/src/api/endpoints/review/next/dto.py +++ b/src/api/endpoints/review/next/dto.py @@ -5,7 +5,7 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.core.enums import RecordType, SuggestedStatus -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo class FinalReviewAnnotationRelevantInfo(BaseModel): diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 0ec83dc1..d89aa4da 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -7,7 +7,7 @@ from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata, FinalReviewBatchInfo, \ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info +from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.constants import USER_ANNOTATION_MODELS from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo diff --git a/src/api/main.py 
b/src/api/main.py index 46ae4a3a..e9916724 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -26,11 +26,11 @@ from src.core.tasks.scheduled.manager import AsyncScheduledTaskManager from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.external.huggingface.hub.client import HuggingFaceHubClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 50ff8920..f54ff025 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,15 +7,15 @@ from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier -from src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator -from src.core.tasks.url.operators.url_404_probe.core 
import URL404ProbeTaskOperator -from src.core.tasks.url.operators.url_duplicate.core import URLDuplicateTaskOperator -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_miscellaneous_metadata.core import URLMiscellaneousMetadataTaskOperator +from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator +from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/url/operators/submit_approved_url/__init__.py b/src/core/tasks/url/operators/duplicate/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/__init__.py rename to src/core/tasks/url/operators/duplicate/__init__.py diff --git a/src/core/tasks/url/operators/url_duplicate/core.py b/src/core/tasks/url/operators/duplicate/core.py similarity index 95% rename from src/core/tasks/url/operators/url_duplicate/core.py rename to src/core/tasks/url/operators/duplicate/core.py index ed3d00a5..dba0147c 100644 --- a/src/core/tasks/url/operators/url_duplicate/core.py +++ b/src/core/tasks/url/operators/duplicate/core.py @@ -4,7 +4,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from 
src.core.tasks.url.operators.url_duplicate.tdo import URLDuplicateTDO +from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/url/operators/url_duplicate/tdo.py b/src/core/tasks/url/operators/duplicate/tdo.py similarity index 100% rename from src/core/tasks/url/operators/url_duplicate/tdo.py rename to src/core/tasks/url/operators/duplicate/tdo.py diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/__init__.py b/src/core/tasks/url/operators/html/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/queries/__init__.py rename to src/core/tasks/url/operators/html/__init__.py diff --git a/src/core/tasks/url/operators/url_html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py similarity index 90% rename from src/core/tasks/url/operators/url_html/content_info_getter.py rename to src/core/tasks/url/operators/html/content_info_getter.py index 644e12e4..d861e265 100644 --- a/src/core/tasks/url/operators/url_html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType diff --git a/src/core/tasks/url/operators/url_html/core.py b/src/core/tasks/url/operators/html/core.py similarity index 94% rename from src/core/tasks/url/operators/url_html/core.py rename to src/core/tasks/url/operators/html/core.py index 81baf348..ff6cb3b1 100644 --- a/src/core/tasks/url/operators/url_html/core.py +++ b/src/core/tasks/url/operators/html/core.py @@ -5,11 +5,11 @@ from src.db.models.instantiations.url.core.pydantic import URLInfo 
from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType -from src.core.tasks.url.operators.url_html.tdo import UrlHtmlTDO -from src.core.tasks.url.operators.url_html.content_info_getter import HTMLContentInfoGetter +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.core.tasks.url.operators.html.content_info_getter import HTMLContentInfoGetter from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface class URLHTMLTaskOperator(URLTaskOperatorBase): diff --git a/src/core/tasks/url/operators/url_404_probe/__init__.py b/src/core/tasks/url/operators/html/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_404_probe/__init__.py rename to src/core/tasks/url/operators/html/queries/__init__.py diff --git a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py similarity index 100% rename from src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py rename to src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py diff --git a/src/core/tasks/url/operators/url_html/scraper/README.md b/src/core/tasks/url/operators/html/scraper/README.md similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/README.md rename to src/core/tasks/url/operators/html/scraper/README.md diff --git a/src/core/tasks/url/operators/url_duplicate/__init__.py b/src/core/tasks/url/operators/html/scraper/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_duplicate/__init__.py 
rename to src/core/tasks/url/operators/html/scraper/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/README.md b/src/core/tasks/url/operators/html/scraper/parser/README.md similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/README.md rename to src/core/tasks/url/operators/html/scraper/parser/README.md diff --git a/src/core/tasks/url/operators/url_html/__init__.py b/src/core/tasks/url/operators/html/scraper/parser/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/__init__.py rename to src/core/tasks/url/operators/html/scraper/parser/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/constants.py b/src/core/tasks/url/operators/html/scraper/parser/constants.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/constants.py rename to src/core/tasks/url/operators/html/scraper/parser/constants.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/core.py b/src/core/tasks/url/operators/html/scraper/parser/core.py similarity index 89% rename from src/core/tasks/url/operators/url_html/scraper/parser/core.py rename to src/core/tasks/url/operators/html/scraper/parser/core.py index 737f03dd..a212b951 100644 --- a/src/core/tasks/url/operators/url_html/scraper/parser/core.py +++ b/src/core/tasks/url/operators/html/scraper/parser/core.py @@ -3,11 +3,11 @@ from bs4 import BeautifulSoup -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.core.tasks.url.operators.url_html.scraper.parser.enums import ParserTypeEnum -from src.core.tasks.url.operators.url_html.scraper.parser.constants import HEADER_TAGS -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache -from src.core.tasks.url.operators.url_html.scraper.parser.util import remove_excess_whitespace, add_https, remove_trailing_backslash, \ +from 
src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.enums import ParserTypeEnum +from src.core.tasks.url.operators.html.scraper.parser.constants import HEADER_TAGS +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.scraper.parser.util import remove_excess_whitespace, add_https, remove_trailing_backslash, \ drop_hostname diff --git a/src/core/tasks/url/operators/url_html/queries/__init__.py b/src/core/tasks/url/operators/html/scraper/parser/dtos/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/queries/__init__.py rename to src/core/tasks/url/operators/html/scraper/parser/dtos/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/dtos/response_html.py b/src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/dtos/response_html.py rename to src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/enums.py b/src/core/tasks/url/operators/html/scraper/parser/enums.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/enums.py rename to src/core/tasks/url/operators/html/scraper/parser/enums.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/mapping.py b/src/core/tasks/url/operators/html/scraper/parser/mapping.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/mapping.py rename to src/core/tasks/url/operators/html/scraper/parser/mapping.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/util.py b/src/core/tasks/url/operators/html/scraper/parser/util.py similarity index 84% rename from src/core/tasks/url/operators/url_html/scraper/parser/util.py rename to 
src/core/tasks/url/operators/html/scraper/parser/util.py index 09453984..a4ea2d1b 100644 --- a/src/core/tasks/url/operators/url_html/scraper/parser/util.py +++ b/src/core/tasks/url/operators/html/scraper/parser/util.py @@ -1,8 +1,8 @@ from urllib.parse import urlparse from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo def convert_to_response_html_info(html_content_infos: list[URLHTMLContentInfo]): diff --git a/src/core/tasks/url/operators/url_html/scraper/__init__.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/__init__.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/constants.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/constants.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/core.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py similarity index 92% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/core.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/core.py index c30bc16e..284ad678 100644 --- a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/core.py +++ b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py @@ -5,8 +5,8 @@ from bs4 
import BeautifulSoup from src.db.client.async_ import AsyncDatabaseClient -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.constants import REQUEST_HEADERS -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo +from src.core.tasks.url.operators.html.scraper.root_url_cache.constants import REQUEST_HEADERS +from src.core.tasks.url.operators.html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo DEBUG = False diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/__init__.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/__init__.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/response.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/response.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py diff --git a/src/core/tasks/url/operators/url_html/tdo.py b/src/core/tasks/url/operators/html/tdo.py similarity index 57% rename from src/core/tasks/url/operators/url_html/tdo.py rename to src/core/tasks/url/operators/html/tdo.py index 326412a3..a098ee02 100644 --- a/src/core/tasks/url/operators/url_html/tdo.py +++ b/src/core/tasks/url/operators/html/tdo.py @@ -2,9 +2,9 @@ from pydantic import BaseModel -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.models.instantiations.url.core.pydantic import URLInfo -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo +from 
src.external.url_request.dtos.url_response import URLResponseInfo class UrlHtmlTDO(BaseModel): diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/dtos/__init__.py b/src/core/tasks/url/operators/misc_metadata/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/dtos/__init__.py rename to src/core/tasks/url/operators/misc_metadata/__init__.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py b/src/core/tasks/url/operators/misc_metadata/core.py similarity index 96% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/core.py rename to src/core/tasks/url/operators/misc_metadata/core.py index 446c32c4..9921846b 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py +++ b/src/core/tasks/url/operators/misc_metadata/core.py @@ -4,7 +4,7 @@ from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.subtasks.miscellaneous_metadata.auto_googler import AutoGooglerMiscMetadataSubtask from src.core.tasks.url.subtasks.miscellaneous_metadata.ckan import CKANMiscMetadataSubtask diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/__init__.py b/src/core/tasks/url/operators/misc_metadata/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/__init__.py rename to src/core/tasks/url/operators/misc_metadata/queries/__init__.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py 
b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py similarity index 93% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py rename to src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index e5add9ce..e87fcaac 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -2,7 +2,7 @@ from sqlalchemy.orm import selectinload from src.collectors.enums import CollectorType -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from src.db.dtos.url.html_content import HTMLContentType from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/has_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/misc_metadata/queries/has_pending_urls_missing_miscellaneous_data.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/queries/has_pending_urls_missing_miscellaneous_data.py rename to src/core/tasks/url/operators/misc_metadata/queries/has_pending_urls_missing_miscellaneous_data.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/tdo.py b/src/core/tasks/url/operators/misc_metadata/tdo.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/tdo.py rename to src/core/tasks/url/operators/misc_metadata/tdo.py diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/__init__.py 
b/src/core/tasks/url/operators/probe/__init__.py
similarity index 100%
rename from src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/__init__.py
rename to src/core/tasks/url/operators/probe/__init__.py
diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py
new file mode 100644
index 00000000..3891955f
--- /dev/null
+++ b/src/core/tasks/url/operators/probe/core.py
@@ -0,0 +1,73 @@
+from typing import final
+from typing_extensions import override
+
+from src.core.tasks.url.operators.base import URLTaskOperatorBase
+from src.core.tasks.url.operators.probe.tdo import URLProbeTDO
+from src.external.url_request.core import URLRequestInterface
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.dtos.url.mapping import URLMapping
+from src.db.enums import TaskType
+
+@final
+class URLProbeTaskOperator(URLTaskOperatorBase):
+    """Probe URLs that the database client reports as not yet probed."""
+
+    def __init__(
+        self,
+        adb_client: AsyncDatabaseClient,
+        url_request_interface: URLRequestInterface
+    ):
+        super().__init__(adb_client=adb_client)
+        self.url_request_interface = url_request_interface
+
+
+    @property
+    @override
+    def task_type(self):
+        """Return the task type recorded for this operator (TaskType.PROBE_URL)."""
+        return TaskType.PROBE_URL
+
+    @override
+    async def meets_task_prerequisites(self) -> bool:
+        """Return whether the database has any URL without a probe."""
+        return await self.adb_client.has_urls_without_probe()
+
+    async def get_urls_without_probe(self) -> list[URLProbeTDO]:
+        """Fetch the unprobed URL mappings and wrap each one in a URLProbeTDO."""
+        url_mappings: list[URLMapping] = await self.adb_client.get_urls_without_probe()
+        return [URLProbeTDO(url_mapping=url_mapping) for url_mapping in url_mappings]
+
+    @override
+    async def inner_task_logic(self):
+        """Link the unprobed URLs to this task, probe them, and call update_database."""
+        tdos = await self.get_urls_without_probe()
+        # URLProbeTDO has no url_id field; the id lives on its url_mapping.
+        url_ids = [task_info.url_mapping.url_id for task_info in tdos]
+        await self.link_urls_to_task(url_ids=url_ids)
+
+        # NOTE(review): probe_urls has no return statement, so `responses` is
+        # None here, and update_database is not defined in this file -- confirm
+        # it is provided elsewhere before relying on this call.
+        responses = await self.probe_urls(tdos)
+        await self.update_database(tdos, responses)
+
+    async def probe_urls(self, tdos: list[URLProbeTDO]):
+        """Probe URLs and add responses to URLProbeTDO
+
+        Modifies:
+            URLProbeTDO.response
+        """
+        url_to_tdo: dict[str, URLProbeTDO] = {
+            tdo.url_mapping.url: tdo for tdo in tdos
+        }
+        responses = await self.url_request_interface.probe_urls(
+            urls=[tdo.url_mapping.url for tdo in tdos]
+        )
+        # Re-associate the responses with the URL mappings
+        # NOTE(review): assumes response.url is exactly the requested URL
+        # string; any other value raises KeyError here -- confirm.
+        for response in responses:
+            tdo = url_to_tdo[response.url]
+            tdo.response = response
+
+
diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/__init__.py b/src/core/tasks/url/operators/probe/queries/__init__.py
similarity index 100%
rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/__init__.py
rename to src/core/tasks/url/operators/probe/queries/__init__.py
diff --git a/src/core/tasks/url/operators/probe/queries/get_urls.py b/src/core/tasks/url/operators/probe/queries/get_urls.py
new file mode 100644
index 00000000..b24071fd
--- /dev/null
+++ b/src/core/tasks/url/operators/probe/queries/get_urls.py
@@ -0,0 +1,31 @@
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from typing_extensions import override, final
+
+from src.db.dtos.url.mapping import URLMapping
+from src.db.models.instantiations.url.core.sqlalchemy import URL
+from src.db.models.instantiations.url.web_metadata.sqlalchemy import UrlWebMetadata
+from src.db.helpers.session import session_helper as sh
+from src.db.queries.base.builder import QueryBuilderBase
+
+
+@final
+class GetURLsWithoutProbeQueryBuilder(QueryBuilderBase):
+
+    @override
+    async def run(self, session: AsyncSession) -> list[URLMapping]:
+        query = (
+            select(
+                URL.id.label("url_id"),
+                URL.url
+            )
+            .outerjoin(
+                UrlWebMetadata,
+                URL.id == UrlWebMetadata.url_id
+            )
+            .where(
+                UrlWebMetadata.id.is_(None)
+            )
+        )
+        db_mappings = await sh.mappings(session, query=query)
+        return [URLMapping(**mapping) for mapping in db_mappings]
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/probe/queries/has_urls.py b/src/core/tasks/url/operators/probe/queries/has_urls.py
new file mode 100644
index 00000000..1f60230f
--- /dev/null
+++ b/src/core/tasks/url/operators/probe/queries/has_urls.py
@@ -0,0 +1,32 @@
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from typing_extensions import override, final
+
+from src.db.helpers.session import session_helper as sh
+from src.db.models.instantiations.url.core.sqlalchemy import URL
+from src.db.models.instantiations.url.web_metadata.sqlalchemy import UrlWebMetadata
+from src.db.queries.base.builder import QueryBuilderBase
+
+@final
+class HasURLsWithoutProbeQueryBuilder(QueryBuilderBase):
+    """Check whether any URL has no matching url_web_metadata row."""
+
+    @override
+    async def run(self, session: AsyncSession) -> bool:
+        """Return True if at least one URL lacks a UrlWebMetadata row."""
+        query = (
+            select(
+                URL.id
+            )
+            .outerjoin(
+                UrlWebMetadata,
+                URL.id == UrlWebMetadata.url_id
+            )
+            .where(
+                UrlWebMetadata.id.is_(None)
+            )
+            # Only existence matters, so stop at the first match instead of
+            # selecting every unprobed URL id.
+            .limit(1)
+        )
+        return await sh.has_results(session, query=query)
diff --git a/src/core/tasks/url/operators/probe/tdo.py b/src/core/tasks/url/operators/probe/tdo.py
new file mode 100644
index 00000000..8af513c1
--- /dev/null
+++ b/src/core/tasks/url/operators/probe/tdo.py
@@ -0,0 +1,10 @@
+from pydantic import BaseModel
+
+from src.external.url_request.probe.model import URLProbeResponse
+from src.db.dtos.url.mapping import URLMapping
+
+
+class URLProbeTDO(BaseModel):
+    """Task data object pairing a URL mapping with its probe response (None by default)."""
+    url_mapping: URLMapping
+    response: URLProbeResponse | None = None
diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/__init__.py b/src/core/tasks/url/operators/probe_404/__init__.py
similarity index 100%
rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/__init__.py
rename to src/core/tasks/url/operators/probe_404/__init__.py
diff --git a/src/core/tasks/url/operators/url_404_probe/core.py b/src/core/tasks/url/operators/probe_404/core.py
similarity index 92%
rename from src/core/tasks/url/operators/url_404_probe/core.py
rename to src/core/tasks/url/operators/probe_404/core.py
index 7da96068..6600d17d 100644
--- a/src/core/tasks/url/operators/url_404_probe/core.py
+++
b/src/core/tasks/url/operators/probe_404/core.py @@ -2,10 +2,10 @@ from pydantic import BaseModel -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO +from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.core.tasks.url.operators.url_404_probe.tdo import URL404ProbeTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/url_404_probe/tdo.py b/src/core/tasks/url/operators/probe_404/tdo.py similarity index 100% rename from src/core/tasks/url/operators/url_404_probe/tdo.py rename to src/core/tasks/url/operators/probe_404/tdo.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/__init__.py b/src/core/tasks/url/operators/submit_approved/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/__init__.py rename to src/core/tasks/url/operators/submit_approved/__init__.py diff --git a/src/core/tasks/url/operators/submit_approved_url/core.py b/src/core/tasks/url/operators/submit_approved/core.py similarity index 96% rename from src/core/tasks/url/operators/submit_approved_url/core.py rename to src/core/tasks/url/operators/submit_approved/core.py index d2e20c3a..e6b1be9f 100644 --- a/src/core/tasks/url/operators/submit_approved_url/core.py +++ b/src/core/tasks/url/operators/submit_approved/core.py @@ -1,7 +1,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase from 
src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/__init__.py b/src/core/tasks/url/operators/submit_approved/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/queries/__init__.py rename to src/core/tasks/url/operators/submit_approved/queries/__init__.py diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py similarity index 96% rename from src/core/tasks/url/operators/submit_approved_url/queries/get.py rename to src/core/tasks/url/operators/submit_approved/queries/get.py index ea40ce79..db128326 100644 --- a/src/core/tasks/url/operators/submit_approved_url/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import selectinload from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/queries/has_validated.py rename to src/core/tasks/url/operators/submit_approved/queries/has_validated.py diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py similarity index 93% rename from src/core/tasks/url/operators/submit_approved_url/queries/mark_submitted.py rename to 
src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py index 9c68ec21..347fba11 100644 --- a/src/core/tasks/url/operators/submit_approved_url/queries/mark_submitted.py +++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/submit_approved_url/tdo.py b/src/core/tasks/url/operators/submit_approved/tdo.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/tdo.py rename to src/core/tasks/url/operators/submit_approved/tdo.py diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py b/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py deleted file mode 100644 index 25e9a3af..00000000 --- a/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py +++ /dev/null @@ -1,84 +0,0 @@ -from http import HTTPStatus -from typing import Optional - -from aiohttp import ClientSession, ClientResponseError -from playwright.async_api import async_playwright -from tqdm.asyncio import tqdm - -from src.core.tasks.url.operators.url_html.scraper.request_interface.constants import HTML_CONTENT_TYPE -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.request_resources import RequestResources -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo - - -class URLRequestInterface: - - async def get_response(self, session: ClientSession, url: str) -> URLResponseInfo: - try: - return 
await self._execute_get(session, url) - except Exception as e: - print(f"An error occurred while fetching {url}: {e}") - return URLResponseInfo(success=False, exception=str(e)) - - async def _execute_get(self, session, url): - try: - async with session.get(url, timeout=20) as response: - response.raise_for_status() - text = await response.text() - return URLResponseInfo( - success=True, - html=text, - content_type=response.headers.get("content-type"), - status=HTTPStatus(response.status) - ) - except ClientResponseError as e: - return URLResponseInfo(success=False, status=HTTPStatus(e.status), exception=str(e)) - - async def fetch_and_render(self, rr: RequestResources, url: str) -> Optional[URLResponseInfo]: - simple_response = await self.get_response(rr.session, url) - if not simple_response.success: - return simple_response - - if simple_response.content_type != HTML_CONTENT_TYPE: - return simple_response - - return await self.get_dynamic_html_content(rr, url) - - async def get_dynamic_html_content(self, rr, url): - # For HTML responses, attempt to load the page to check for dynamic html content - async with rr.semaphore: - page = await rr.browser.new_page() - try: - await page.goto(url) - await page.wait_for_load_state("networkidle") - html_content = await page.content() - return URLResponseInfo( - success=True, - html=html_content, - content_type=HTML_CONTENT_TYPE, - status=HTTPStatus.OK - ) - except Exception as e: - return URLResponseInfo(success=False, exception=str(e)) - finally: - await page.close() - - async def fetch_urls(self, urls: list[str]) -> list[URLResponseInfo]: - async with ClientSession() as session: - async with async_playwright() as playwright: - browser = await playwright.chromium.launch(headless=True) - request_resources = RequestResources(session=session, browser=browser) - tasks = [self.fetch_and_render(request_resources, url) for url in urls] - results = await tqdm.gather(*tasks) - return results - - async def make_requests_with_html( - 
self, - urls: list[str], - ) -> list[URLResponseInfo]: - return await self.fetch_urls(urls) - - async def make_simple_requests(self, urls: list[str]) -> list[URLResponseInfo]: - async with ClientSession() as session: - tasks = [self.get_response(session, url) for url in urls] - results = await tqdm.gather(*tasks) - return results diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py index 0f183f78..e060d0d3 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.subtasks.miscellaneous_metadata.base import \ MiscellaneousMetadataSubtaskBase diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py index 7b38504d..3ca7357b 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO class MiscellaneousMetadataSubtaskBase(ABC): diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py index 90512e2b..ef60b48c 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from 
src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.subtasks.miscellaneous_metadata.base import \ MiscellaneousMetadataSubtaskBase diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py index bb3eaadf..18a749b7 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.subtasks.miscellaneous_metadata.base import \ MiscellaneousMetadataSubtaskBase diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 72b13f18..93f3dbea 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -77,19 +77,21 @@ HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.queries.get import GetValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.queries.has_validated import HasValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.core.tasks.url.operators.url_404_probe.tdo import URL404ProbeTDO -from src.core.tasks.url.operators.url_duplicate.tdo import URLDuplicateTDO -from src.core.tasks.url.operators.url_html.queries.get_pending_urls_without_html_data import \ +from src.core.tasks.url.operators.probe.queries.get_urls import GetURLsWithoutProbeQueryBuilder +from 
src.core.tasks.url.operators.probe.queries.has_urls import HasURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO +from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder +from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder +from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO +from src.core.tasks.url.operators.html.queries.get_pending_urls_without_html_data import \ GetPendingURLsWithoutHTMLDataQueryBuilder -from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.get_pending_urls_missing_miscellaneous_data import \ +from src.core.tasks.url.operators.misc_metadata.queries.get_pending_urls_missing_miscellaneous_data import \ GetPendingURLsMissingMiscellaneousDataQueryBuilder -from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ +from src.core.tasks.url.operators.misc_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ HasPendingURsMissingMiscellaneousDataQueryBuilder -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager @@ -1571,3 +1573,13 @@ async def check_valid_urls_updated(self) -> bool: async def get_current_database_time(self) -> datetime: return await self.scalar(select(func.now())) + + async def has_urls_without_probe(self) -> bool: + return await self.run_query_builder( + 
HasURLsWithoutProbeQueryBuilder() + ) + + async def get_urls_without_probe(self) -> list[URLMapping]: + return await self.run_query_builder( + GetURLsWithoutProbeQueryBuilder() + ) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 866feb25..e2d21705 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -20,7 +20,7 @@ from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index ed2d361c..4f21c8c2 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -5,8 +5,8 @@ from src.api.endpoints.review.next.dto import FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, \ FinalReviewAnnotationAgencyAutoInfo, FinalReviewAnnotationAgencyInfo from src.core.enums import RecordType, SuggestionType -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo from src.db.dtos.url.with_html import URLWithHTML from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping.py index 38efbce4..18fc5be2 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping.py @@ 
-2,5 +2,6 @@ class URLMapping(BaseModel): + """Mapping between url and url_id.""" url: str url_id: int diff --git a/src/db/enums.py b/src/db/enums.py index 6c1d1496..c8ed9840 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -44,6 +44,7 @@ class TaskType(PyEnum): SYNC_AGENCIES = "Sync Agencies" SYNC_DATA_SOURCES = "Sync Data Sources" PUSH_TO_HUGGINGFACE = "Push to Hugging Face" + PROBE_URL = "URL Probe" class ChangeLogOperationType(PyEnum): INSERT = "INSERT" diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 2b3776c1..9736cd9e 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -43,6 +43,10 @@ async def mappings(session: AsyncSession, query: sa.Select) -> Sequence[sa.RowMa raw_result = await session.execute(query) return raw_result.mappings().all() +async def has_results(session: AsyncSession, query: sa.Select) -> bool: + raw_result = await session.execute(query) + return raw_result.first() is not None + async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], diff --git a/src/db/models/instantiations/url/web_metadata/__init__.py b/src/db/models/instantiations/url/web_metadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py new file mode 100644 index 00000000..dd2f8391 --- /dev/null +++ b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py @@ -0,0 +1,33 @@ +from sqlalchemy import Column, Text, Boolean, Integer + +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates import StandardBase + + +class UrlWebMetadata( + StandardBase, + URLDependentMixin, + CreatedAtMixin, + UpdatedAtMixin +): + """Contains information about the web page.""" + __tablename__ = "url_web_metadata" + + accessed = Column( + Boolean(), + nullable=False + ) 
+ status_code = Column( + Integer(), + nullable=False + ) + content_type = Column( + Text(), + nullable=True + ) + error_message = Column( + Text(), + nullable=True + ) + + diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 1447ae87..ee442600 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -4,7 +4,7 @@ from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/README.md b/src/external/url_request/README.md similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/README.md rename to src/external/url_request/README.md diff --git a/src/external/url_request/__init__.py b/src/external/url_request/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/constants.py b/src/external/url_request/constants.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/constants.py rename to src/external/url_request/constants.py diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py new file mode 100644 index 00000000..e2143bcc --- /dev/null +++ b/src/external/url_request/core.py @@ -0,0 +1,21 @@ +from aiohttp import ClientSession + +from src.external.url_request.dtos.url_response import URLResponseInfo 
+from src.external.url_request.probe.core import URLProbeManager +from src.external.url_request.probe.model import URLProbeResponse +from src.external.url_request.request import fetch_urls + + +class URLRequestInterface: + + @staticmethod + async def make_requests_with_html( + urls: list[str], + ) -> list[URLResponseInfo]: + return await fetch_urls(urls) + + @staticmethod + async def probe_urls(urls: list[str]) -> list[URLProbeResponse]: + async with ClientSession() as session: + manager = URLProbeManager(session=session) + return await manager.probe_urls(urls=urls) diff --git a/src/external/url_request/dtos/__init__.py b/src/external/url_request/dtos/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/request_resources.py b/src/external/url_request/dtos/request_resources.py similarity index 74% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/request_resources.py rename to src/external/url_request/dtos/request_resources.py index 62ad714a..01a5365f 100644 --- a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/request_resources.py +++ b/src/external/url_request/dtos/request_resources.py @@ -4,7 +4,7 @@ from aiohttp import ClientSession from playwright.async_api import async_playwright -from src.core.tasks.url.operators.url_html.scraper.request_interface.constants import MAX_CONCURRENCY +from src.external.url_request.constants import MAX_CONCURRENCY @dataclass diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/url_response.py b/src/external/url_request/dtos/url_response.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/url_response.py rename to src/external/url_request/dtos/url_response.py diff --git a/src/external/url_request/probe/__init__.py b/src/external/url_request/probe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py new file mode 100644 index 00000000..b15286d3 --- /dev/null +++ b/src/external/url_request/probe/core.py @@ -0,0 +1,43 @@ +import asyncio + +from aiohttp import ClientSession, ClientResponseError + +from src.external.url_request.probe.format import format_client_response, format_client_response_error, format_error +from src.external.url_request.probe.model import URLProbeResponse + + +class URLProbeManager: + + def __init__( + self, + session: ClientSession + ): + self.session = session + + async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]: + return await asyncio.gather(*[self.probe_url(url) for url in urls]) + + async def probe_url(self, url: str) -> URLProbeResponse: + result = await self.head(url) + if result.error is None: + return result + return await self.get(url) + + + async def head(self, url: str) -> URLProbeResponse: + try: + async with self.session.head(url) as response: + return format_client_response(url, response=response) + except ClientResponseError as e: + return format_client_response_error(url, error=e) + except Exception as e: + return format_error(url, error=e) + + async def get(self, url: str) -> URLProbeResponse: + try: + async with self.session.get(url) as response: + return format_client_response(url, response=response) + except ClientResponseError as e: + return format_client_response_error(url, error=e) + except Exception as e: + return format_error(url, error=e) \ No newline at end of file diff --git a/src/external/url_request/probe/format.py b/src/external/url_request/probe/format.py new file mode 100644 index 00000000..65430c1e --- /dev/null +++ b/src/external/url_request/probe/format.py @@ -0,0 +1,32 @@ +from aiohttp import ClientResponse, ClientResponseError + +from src.external.url_request.probe.model import URLProbeResponse + + +def format_content_type(content_type: str) -> str: + return content_type.split(";")[0].strip() + +def 
format_client_response(url: str, response: ClientResponse) -> URLProbeResponse: + return URLProbeResponse( + url=url, + status_code=response.status, + content_type=format_content_type( + response.headers.get("content-type") + ) + ) + +def format_client_response_error(url: str, error: ClientResponseError) -> URLProbeResponse: + return URLProbeResponse( + url=url, + status_code=error.status, + content_type=None, + error=str(error) + ) + +def format_error(url: str, error: Exception) -> URLProbeResponse: + return URLProbeResponse( + url=url, + status_code=None, + content_type=None, + error=str(error) + ) \ No newline at end of file diff --git a/src/external/url_request/probe/model.py b/src/external/url_request/probe/model.py new file mode 100644 index 00000000..6ddff60e --- /dev/null +++ b/src/external/url_request/probe/model.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel, model_validator + + +class URLProbeResponse(BaseModel): + url: str + status_code: int | None + content_type: str | None + error: str | None = None + + @model_validator(mode='after') + def check_error_mutually_exclusive_with_status_and_content(self): + if self.error is not None: + if self.status_code is not None or self.content_type is not None: + raise ValueError('Error is mutually exclusive with status code and content type') + return self diff --git a/src/external/url_request/request.py b/src/external/url_request/request.py new file mode 100644 index 00000000..40fc2dd6 --- /dev/null +++ b/src/external/url_request/request.py @@ -0,0 +1,91 @@ +"""Functions for making HTTP requests.""" +from http import HTTPStatus + +from aiohttp import ClientSession, ClientResponseError +from playwright.async_api import async_playwright +from tqdm.asyncio import tqdm + +from src.external.url_request.constants import HTML_CONTENT_TYPE +from src.external.url_request.dtos.request_resources import RequestResources + +from src.external.url_request.dtos.url_response import URLResponseInfo + + +async def execute_get( + 
session: ClientSession, + url: str +) -> URLResponseInfo: + try: + async with session.get(url, timeout=20) as response: + response.raise_for_status() + text = await response.text() + return URLResponseInfo( + success=True, + html=text, + content_type=response.headers.get("content-type"), + status=HTTPStatus(response.status) + ) + except ClientResponseError as e: + return URLResponseInfo(success=False, status=HTTPStatus(e.status), exception=str(e)) + + +async def get_response(session: ClientSession, url: str) -> URLResponseInfo: + try: + return await execute_get(session, url) + except Exception as e: + print(f"An error occurred while fetching {url}: {e}") + return URLResponseInfo(success=False, exception=str(e)) + + +async def make_simple_requests(urls: list[str]) -> list[URLResponseInfo]: + async with ClientSession() as session: + tasks = [get_response(session, url) for url in urls] + results = await tqdm.gather(*tasks) + return results + + +async def get_dynamic_html_content( + rr: RequestResources, + url: str +) -> URLResponseInfo | None: + # For HTML responses, attempt to load the page to check for dynamic html content + async with rr.semaphore: + page = await rr.browser.new_page() + try: + await page.goto(url) + await page.wait_for_load_state("networkidle") + html_content = await page.content() + return URLResponseInfo( + success=True, + html=html_content, + content_type=HTML_CONTENT_TYPE, + status=HTTPStatus.OK + ) + except Exception as e: + return URLResponseInfo(success=False, exception=str(e)) + finally: + await page.close() + + +async def fetch_and_render( + rr: RequestResources, + url: str +) -> URLResponseInfo | None: + simple_response = await get_response(rr.session, url) + if not simple_response.success: + return simple_response + + if simple_response.content_type != HTML_CONTENT_TYPE: + return simple_response + + return await get_dynamic_html_content(rr, url) + + +async def fetch_urls(urls: list[str]) -> list[URLResponseInfo]: + async with 
ClientSession() as session: + async with async_playwright() as playwright: + browser = await playwright.chromium.launch(headless=True) + request_resources = RequestResources(session=session, browser=browser) + tasks = [fetch_and_render(request_resources, url) for url in urls] + results = await tqdm.gather(*tasks) + return results diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py index 690b83e4..78dd0f55 100644 --- a/tests/automated/integration/api/test_annotate.py +++ b/tests/automated/integration/api/test_annotate.py @@ -9,7 +9,7 @@ from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/tests/automated/integration/html_tag_collector/test_root_url_cache.py b/tests/automated/integration/html_tag_collector/test_root_url_cache.py index 151985cf..0add726e 100644 --- a/tests/automated/integration/html_tag_collector/test_root_url_cache.py +++ b/tests/automated/integration/html_tag_collector/test_root_url_cache.py @@ -1,7 +1,7 @@ import pytest -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache +from 
src.core.tasks.url.operators.html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo async def mock_get_request(url: str) -> RootURLCacheResponseInfo: diff --git a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py index bd66e409..2f4e64b5 100644 --- a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py +++ b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py @@ -3,7 +3,7 @@ import pytest -from src.core.tasks.url.operators.url_duplicate.core import URLDuplicateTaskOperator +from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.instantiations.url.core.sqlalchemy import URL diff --git a/tests/automated/integration/tasks/url/html/mocks/methods.py b/tests/automated/integration/tasks/url/html/mocks/methods.py index dd623ee8..ddf1fc6f 100644 --- a/tests/automated/integration/tasks/url/html/mocks/methods.py +++ b/tests/automated/integration/tasks/url/html/mocks/methods.py @@ -3,8 +3,8 @@ from aiohttp import ClientResponseError, RequestInfo -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.external.url_request.dtos.url_response import URLResponseInfo from tests.automated.integration.tasks.url.html.mocks.constants import MOCK_CONTENT_TYPE, MOCK_HTML_CONTENT diff --git a/tests/automated/integration/tasks/url/html/setup.py b/tests/automated/integration/tasks/url/html/setup.py index e6a4de81..2d6a47a7 100644 --- a/tests/automated/integration/tasks/url/html/setup.py +++ 
b/tests/automated/integration/tasks/url/html/setup.py @@ -1,10 +1,10 @@ import types -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient from tests.automated.integration.tasks.url.html.mocks.methods import mock_make_requests, mock_get_from_cache, mock_parse diff --git a/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py index 8e27908b..ce9861e0 100644 --- a/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py @@ -1,7 +1,7 @@ import pytest from deepdiff import DeepDiff -from src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator +from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.enums import TaskType from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource diff --git a/tests/automated/integration/tasks/url/test_url_404_probe.py b/tests/automated/integration/tasks/url/test_url_404_probe.py index 54592640..2022a8f3 100644 --- 
a/tests/automated/integration/tasks/url/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/test_url_404_probe.py @@ -5,13 +5,13 @@ import pytest from aiohttp import ClientResponseError, RequestInfo -from src.core.tasks.url.operators.url_404_probe.core import URL404ProbeTaskOperator -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator +from src.external.url_request.core import URLRequestInterface from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo +from src.external.url_request.dtos.url_response import URLResponseInfo from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters diff --git a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py index ed7f1336..6e95fccb 100644 --- a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py @@ -2,7 +2,7 @@ import pytest -from src.core.tasks.url.operators.url_miscellaneous_metadata.core import URLMiscellaneousMetadataTaskOperator +from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.core.sqlalchemy import URL from 
src.collectors.enums import CollectorType diff --git a/tests/conftest.py b/tests/conftest.py index f26249cd..3d9cebc6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,7 +127,7 @@ def db_data_creator( db_data_creator = DBDataCreator(db_client=db_client_test) yield db_data_creator -@pytest.fixture +@pytest_asyncio.fixture async def test_client_session() -> AsyncGenerator[ClientSession, Any]: async with ClientSession() as session: yield session diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls.py index daec2445..82324042 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls.py @@ -1,7 +1,7 @@ from datetime import datetime from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.instantiations.url.core.pydantic import URLInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index f86e9a25..d0a951f8 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -19,8 +19,8 @@ from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType, URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from 
tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters diff --git a/tests/manual/core/tasks/test_url_html_task_operator.py b/tests/manual/core/tasks/test_url_html_task_operator.py index f4cc36d6..b6031d77 100644 --- a/tests/manual/core/tasks/test_url_html_task_operator.py +++ b/tests/manual/core/tasks/test_url_html_task_operator.py @@ -1,12 +1,10 @@ -from unittest.mock import patch - import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO, ManualBatchInnerInputDTO -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache @pytest.mark.asyncio diff --git a/tests/manual/external/url_request/__init__.py b/tests/manual/external/url_request/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/external/url_request/test_url_probe.py b/tests/manual/external/url_request/test_url_probe.py new file mode 100644 index 00000000..75396746 --- /dev/null +++ b/tests/manual/external/url_request/test_url_probe.py @@ -0,0 +1,22 @@ +import pytest + +from src.external.url_request.probe import URLProbeManager + +URLS = [ + "https://www.google.com", + "https://www.example.com", + "https://www.example.org", + "https://www.nonexistent.com", +] + +@pytest.mark.asyncio +async def 
test_url_probe_head(test_client_session): + manager = URLProbeManager(session=test_client_session) + result = await manager.head(url=URLS[0]) + print(result) + +@pytest.mark.asyncio +async def test_url_probe(test_client_session): + manager = URLProbeManager(session=test_client_session) + results = await manager.probe_urls(urls=URLS) + print(results) \ No newline at end of file diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index ef8f0df3..857def21 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -1,9 +1,9 @@ import pytest -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.url.core.pydantic import URLInfo from tests.helpers.data_creator.core import DBDataCreator From 20f1f9bfce96d544970978202c706a4637199e00 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 1 Aug 2025 08:51:21 -0400 Subject: [PATCH 04/13] Finish draft of Probe Task logic --- ...-99eceed6e614_add_web_status_info_table.py | 2 +- src/core/tasks/url/operators/probe/core.py | 31 ++++++++++++++----- .../url/operators/probe/queries/insert.py | 15 +++++++++ src/db/client/async_.py | 9 ++++++ 
.../url/web_metadata/pydantic.py | 9 ++++++ .../url/web_metadata/sqlalchemy.py | 2 +- 6 files changed, 58 insertions(+), 10 deletions(-) create mode 100644 src/core/tasks/url/operators/probe/queries/insert.py create mode 100644 src/db/models/instantiations/url/web_metadata/pydantic.py diff --git a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py index 0b69cc90..077d8277 100644 --- a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py +++ b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py @@ -75,7 +75,7 @@ def _create_url_html_info_table() -> None: id_column(), url_id_column(), sa.Column('accessed', sa.Boolean(), nullable=False), - sa.Column('status_code', sa.Integer(), nullable=False), + sa.Column('status_code', sa.Integer(), nullable=True), sa.Column('content_type', sa.Text(), nullable=True), sa.Column('error_message', sa.Text(), nullable=True), created_at_column(), diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py index 3891955f..98d4f8ab 100644 --- a/src/core/tasks/url/operators/probe/core.py +++ b/src/core/tasks/url/operators/probe/core.py @@ -3,6 +3,7 @@ from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.mapping import URLMapping @@ -22,7 +23,7 @@ def __init__( @property @override - def task_type(self): + def task_type(self) -> TaskType: return TaskType.PROBE_URL @override @@ -34,15 +35,15 @@ async def get_urls_without_probe(self) -> list[URLProbeTDO]: return [URLProbeTDO(url_mapping=url_mapping) for url_mapping in url_mappings] @override - async def 
inner_task_logic(self): + async def inner_task_logic(self) -> None: tdos = await self.get_urls_without_probe() - url_ids = [task_info.url_id for task_info in tdos] - await self.link_urls_to_task(url_ids=url_ids) - - responses = await self.probe_urls(tdos) - await self.update_database(tdos, responses) + await self.link_urls_to_task( + url_ids=[tdo.url_mapping.url_id for tdo in tdos] + ) + await self.probe_urls(tdos) + await self.update_database(tdos) - async def probe_urls(self, tdos: list[URLProbeTDO]): + async def probe_urls(self, tdos: list[URLProbeTDO]) -> None: """Probe URLs and add responses to URLProbeTDO Modifies: @@ -59,4 +60,18 @@ async def probe_urls(self, tdos: list[URLProbeTDO]): tdo = url_to_tdo[response.url] tdo.response = response + async def update_database(self, tdos: list[URLProbeTDO]) -> None: + web_metadata_objects: list[URLWebMetadataPydantic] = [] + for tdo in tdos: + response = tdo.response + web_metadata_object = URLWebMetadataPydantic( + url_id=tdo.url_mapping.url_id, + accessed=response.status_code is not None, + status_code=response.status_code, + content_type=response.content_type, + error_message=response.error + ) + web_metadata_objects.append(web_metadata_object) + await self.adb_client.bulk_insert(web_metadata_objects) + diff --git a/src/core/tasks/url/operators/probe/queries/insert.py b/src/core/tasks/url/operators/probe/queries/insert.py new file mode 100644 index 00000000..2b312e36 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert.py @@ -0,0 +1,15 @@ +from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import override, final + +from src.db.queries.base.builder import QueryBuilderBase + +@final +class InsertURLMetadataInfoQueryBuilder(QueryBuilderBase): + + def __init__( + self, + + ): + + @override + async def run(self, session: AsyncSession) -> None: diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 93f3dbea..9242194b 100644 --- a/src/db/client/async_.py +++ 
b/src/db/client/async_.py @@ -144,6 +144,7 @@ GetMetricsURLSAggregatedPendingQueryBuilder from src.db.statement_composer import StatementComposer from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo @@ -238,6 +239,14 @@ async def bulk_delete( ): return await sh.bulk_delete(session, models) + @session_manager + async def bulk_insert( + self, + session: AsyncSession, + models: list[BulkInsertableModel], + ): + return await sh.bulk_insert(session, models) + @session_manager async def scalar(self, session: AsyncSession, statement): """Fetch the first column of the first row.""" diff --git a/src/db/models/instantiations/url/web_metadata/pydantic.py b/src/db/models/instantiations/url/web_metadata/pydantic.py new file mode 100644 index 00000000..e46a60b9 --- /dev/null +++ b/src/db/models/instantiations/url/web_metadata/pydantic.py @@ -0,0 +1,9 @@ +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLWebMetadataPydantic(BulkInsertableModel): + url_id: int + accessed: bool + status_code: int | None + content_type: str | None + error_message: str | None \ No newline at end of file diff --git a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py index dd2f8391..48beb4b4 100644 --- a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py +++ b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py @@ -19,7 +19,7 @@ class UrlWebMetadata( ) status_code = Column( Integer(), - nullable=False + nullable=True ) content_type = Column( Text(), From 0c8c5ebf4ee2cdc71da542917e0714fabf6c93f0 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 1 Aug 2025 09:17:55 -0400 Subject: [PATCH 
05/13] Begin draft of test logic --- .../integration/tasks/url/probe/__init__.py | 0 .../integration/tasks/url/probe/conftest.py | 0 .../integration/tasks/url/probe/setup/__init__.py | 0 .../integration/tasks/url/probe/setup/manager.py | 12 ++++++++++++ .../tasks/url/probe/setup/models/__init__.py | 0 .../tasks/url/probe/setup/models/entry.py | 11 +++++++++++ .../tasks/url/probe/setup/models/planned_response.py | 7 +++++++ .../integration/tasks/url/probe/test_core.py | 0 8 files changed, 30 insertions(+) create mode 100644 tests/automated/integration/tasks/url/probe/__init__.py create mode 100644 tests/automated/integration/tasks/url/probe/conftest.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/__init__.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/manager.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/models/__init__.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/models/entry.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/models/planned_response.py create mode 100644 tests/automated/integration/tasks/url/probe/test_core.py diff --git a/tests/automated/integration/tasks/url/probe/__init__.py b/tests/automated/integration/tasks/url/probe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/conftest.py b/tests/automated/integration/tasks/url/probe/conftest.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/__init__.py b/tests/automated/integration/tasks/url/probe/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/manager.py b/tests/automated/integration/tasks/url/probe/setup/manager.py new file mode 100644 index 00000000..9b5bb48b --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/manager.py @@ -0,0 +1,12 @@ +from 
tests.helpers.data_creator.core import DBDataCreator + + +class TestURLProbeTaskSetupManager: + + def __init__( + self, + db_data_creator: DBDataCreator + ): + self.db_data_creator = db_data_creator + + async def setup(self): diff --git a/tests/automated/integration/tasks/url/probe/setup/models/__init__.py b/tests/automated/integration/tasks/url/probe/setup/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/models/entry.py b/tests/automated/integration/tasks/url/probe/setup/models/entry.py new file mode 100644 index 00000000..b39487ef --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/models/entry.py @@ -0,0 +1,11 @@ +from pydantic import model_validator + +from src.collectors.enums import URLStatus +from src.external.url_request.probe.model import URLProbeResponse +from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse + + +class TestURLProbeTaskEntry: + url: str + url_status: URLStatus + url_probe_response: URLProbePlannedResponse diff --git a/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py b/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py new file mode 100644 index 00000000..41f17883 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class URLProbePlannedResponse(BaseModel): + status_code: int | None + content_type: str | None + error: str | None \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/test_core.py b/tests/automated/integration/tasks/url/probe/test_core.py new file mode 100644 index 00000000..e69de29b From 24f2cacf824c7e34fff31159886272d2533aabe3 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 1 Aug 2025 14:25:21 -0400 Subject: [PATCH 06/13] Finish tests for URL Probe --- src/api/endpoints/batch/urls/dto.py | 2 
+- src/api/endpoints/batch/urls/query.py | 2 +- src/api/endpoints/task/by_id/dto.py | 2 +- src/api/endpoints/task/by_id/query.py | 2 +- src/core/preprocessors/autogoogler.py | 2 +- src/core/preprocessors/base.py | 2 +- src/core/preprocessors/ckan.py | 2 +- src/core/preprocessors/common_crawler.py | 2 +- src/core/preprocessors/example.py | 2 +- src/core/preprocessors/muckrock.py | 2 +- .../agency_identification/dtos/suggestion.py | 12 ++-- src/core/tasks/url/operators/html/core.py | 2 +- .../get_pending_urls_without_html_data.py | 2 +- src/core/tasks/url/operators/html/tdo.py | 2 +- .../url/operators/probe/queries/get_urls.py | 8 +-- .../url/operators/probe/queries/has_urls.py | 8 +-- src/db/client/async_.py | 61 ++++++++++++------- src/db/client/sync.py | 2 +- .../instantiations/url/core/pydantic.py | 17 ------ .../url/core/pydantic/__init__.py | 0 .../instantiations/url/core/pydantic/info.py | 17 ++++++ .../url/core/pydantic/insert.py | 19 ++++++ .../url/web_metadata/pydantic.py | 9 +++ .../url/web_metadata/sqlalchemy.py | 2 +- src/external/url_request/probe/model.py | 15 +++-- .../db/client/test_delete_url_updated_at.py | 2 +- .../integration/db/client/test_insert_urls.py | 2 +- .../happy_path/test_happy_path.py | 1 - .../integration/tasks/url/probe/conftest.py | 15 +++++ .../integration/tasks/url/probe/constants.py | 3 + .../integration/tasks/url/probe/setup/core.py | 22 +++++++ .../integration/tasks/url/probe/setup/data.py | 36 +++++++++++ .../tasks/url/probe/setup/format.py | 24 ++++++++ .../tasks/url/probe/setup/manager.py | 12 ---- .../tasks/url/probe/setup/mocks/__init__.py | 0 .../url/probe/setup/mocks/probe_manager.py | 20 ++++++ .../tasks/url/probe/setup/models/entry.py | 5 +- .../tasks/url/probe/setup/queries/__init__.py | 0 .../tasks/url/probe/setup/queries/check.py | 43 +++++++++++++ .../integration/tasks/url/probe/test_core.py | 33 ++++++++++ .../test_autogoogler_collector.py | 2 +- .../test_common_crawl_collector.py | 2 +- 
.../test_muckrock_collectors.py | 2 +- .../data_creator/commands/impl/urls.py | 2 +- tests/helpers/data_creator/core.py | 16 +---- .../test_html_tag_collector_integration.py | 2 +- 46 files changed, 332 insertions(+), 108 deletions(-) delete mode 100644 src/db/models/instantiations/url/core/pydantic.py create mode 100644 src/db/models/instantiations/url/core/pydantic/__init__.py create mode 100644 src/db/models/instantiations/url/core/pydantic/info.py create mode 100644 src/db/models/instantiations/url/core/pydantic/insert.py create mode 100644 tests/automated/integration/tasks/url/probe/constants.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/core.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/data.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/format.py delete mode 100644 tests/automated/integration/tasks/url/probe/setup/manager.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/queries/__init__.py create mode 100644 tests/automated/integration/tasks/url/probe/setup/queries/check.py diff --git a/src/api/endpoints/batch/urls/dto.py b/src/api/endpoints/batch/urls/dto.py index 13e8659c..90f9b209 100644 --- a/src/api/endpoints/batch/urls/dto.py +++ b/src/api/endpoints/batch/urls/dto.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class GetURLsByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 49b95e13..980b4c81 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,8 +1,8 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import 
AsyncSession -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/task/by_id/dto.py b/src/api/endpoints/task/by_id/dto.py index 9213aa90..eba6cece 100644 --- a/src/api/endpoints/task/by_id/dto.py +++ b/src/api/endpoints/task/by_id/dto.py @@ -3,8 +3,8 @@ from pydantic import BaseModel +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.enums import TaskType from src.core.enums import BatchStatus diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index 8133085f..c2b32234 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.task.by_id.dto import TaskInfo from src.collectors.enums import URLStatus from src.core.enums import BatchStatus -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.db.models.instantiations.task.core import Task diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index 460cf0e0..b41eba76 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ -1,7 +1,7 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase +from 
src.db.models.instantiations.url.core.pydantic.info import URLInfo class AutoGooglerPreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/base.py b/src/core/preprocessors/base.py index beb31cb7..2f777d5f 100644 --- a/src/core/preprocessors/base.py +++ b/src/core/preprocessors/base.py @@ -2,7 +2,7 @@ from abc import ABC from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class PreprocessorBase(ABC): diff --git a/src/core/preprocessors/ckan.py b/src/core/preprocessors/ckan.py index b72ee3c9..0b1cef2e 100644 --- a/src/core/preprocessors/ckan.py +++ b/src/core/preprocessors/ckan.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class CKANPreprocessor: diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index 16f5d730..d2f0d988 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,7 +1,7 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class CommonCrawlerPreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index 691d23c6..580b739e 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -1,8 +1,8 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class 
ExamplePreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index b42a198f..b0f1d9bc 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,7 +1,7 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class MuckrockPreprocessor(PreprocessorBase): diff --git a/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py b/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py index f42ecfc2..39f2cab3 100644 --- a/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py +++ b/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py @@ -8,9 +8,9 @@ class URLAgencySuggestionInfo(BaseModel): url_id: int suggestion_type: SuggestionType = SuggestionType.UNKNOWN - pdap_agency_id: Optional[int] = None - agency_name: Optional[str] = None - state: Optional[str] = None - county: Optional[str] = None - locality: Optional[str] = None - user_id: Optional[int] = None + pdap_agency_id: int | None = None + agency_name: str | None = None + state: str | None = None + county: str | None = None + locality: str | None = None + user_id: int | None = None diff --git a/src/core/tasks/url/operators/html/core.py b/src/core/tasks/url/operators/html/core.py index ff6cb3b1..25927e08 100644 --- a/src/core/tasks/url/operators/html/core.py +++ b/src/core/tasks/url/operators/html/core.py @@ -1,8 +1,8 @@ from http import HTTPStatus from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import 
TaskType from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO diff --git a/src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py index 16ceb4f4..d09f8bca 100644 --- a/src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py +++ b/src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/html/tdo.py b/src/core/tasks/url/operators/html/tdo.py index a098ee02..98bd12da 100644 --- a/src/core/tasks/url/operators/html/tdo.py +++ b/src/core/tasks/url/operators/html/tdo.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.external.url_request.dtos.url_response import URLResponseInfo diff --git a/src/core/tasks/url/operators/probe/queries/get_urls.py b/src/core/tasks/url/operators/probe/queries/get_urls.py index b24071fd..9df9191f 100644 --- a/src/core/tasks/url/operators/probe/queries/get_urls.py +++ b/src/core/tasks/url/operators/probe/queries/get_urls.py @@ -4,7 +4,7 @@ from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.web_metadata.sqlalchemy import UrlWebMetadata +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata from 
src.db.helpers.session import session_helper as sh from src.db.queries.base.builder import QueryBuilderBase @@ -20,11 +20,11 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: URL.url ) .outerjoin( - UrlWebMetadata, - URL.id == UrlWebMetadata.url_id + URLWebMetadata, + URL.id == URLWebMetadata.url_id ) .where( - UrlWebMetadata.id.is_(None) + URLWebMetadata.id.is_(None) ) ) db_mappings = await sh.mappings(session, query=query) diff --git a/src/core/tasks/url/operators/probe/queries/has_urls.py b/src/core/tasks/url/operators/probe/queries/has_urls.py index 1f60230f..1ae7835b 100644 --- a/src/core/tasks/url/operators/probe/queries/has_urls.py +++ b/src/core/tasks/url/operators/probe/queries/has_urls.py @@ -4,7 +4,7 @@ from src.db.helpers.session import session_helper as sh from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.web_metadata.sqlalchemy import UrlWebMetadata +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.queries.base.builder import QueryBuilderBase @final @@ -17,11 +17,11 @@ async def run(self, session: AsyncSession) -> bool: URL.id ) .outerjoin( - UrlWebMetadata, - URL.id == UrlWebMetadata.url_id + URLWebMetadata, + URL.id == URLWebMetadata.url_id ) .where( - UrlWebMetadata.id.is_(None) + URLWebMetadata.id.is_(None) ) ) return await sh.has_results(session, query=query) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 9242194b..69c88cbe 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -121,7 +121,7 @@ from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from 
src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo @@ -150,6 +150,7 @@ from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo + class AsyncDatabaseClient: def __init__(self, db_url: Optional[str] = None): if db_url is None: @@ -187,7 +188,6 @@ async def wrapper(self, *args, **kwargs): return wrapper - @session_manager async def execute(self, session: AsyncSession, statement): await session.execute(statement) @@ -565,7 +565,6 @@ async def get_urls_with_html_data_and_without_auto_record_type_suggestion( model=AutoRecordTypeSuggestion ) - async def has_urls_with_html_data_and_without_models( self, session: AsyncSession, @@ -607,7 +606,6 @@ async def get_all( """Get all records of a model. Used primarily in testing.""" return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) - @session_manager async def load_root_url_cache(self, session: AsyncSession) -> dict[str, str]: statement = select(RootURL) @@ -631,7 +629,6 @@ async def get_urls( page=page, errors=errors )) - @session_manager async def initiate_task( self, @@ -743,7 +740,6 @@ async def get_urls_without_agency_suggestions( """Retrieve URLs without confirmed or suggested agencies.""" return await self.run_query_builder(GetPendingURLsWithoutAgencySuggestionsQueryBuilder()) - async def get_next_url_agency_for_annotation( self, user_id: int, @@ -754,7 +750,6 @@ async def get_next_url_agency_for_annotation( batch_id=batch_id )) - @session_manager async def upsert_new_agencies( self, @@ -776,7 +771,6 @@ async def upsert_new_agencies( agency.locality = suggestion.locality session.add(agency) - @session_manager async def add_confirmed_agency_url_links( self, @@ -876,7 +870,6 @@ async def reject_url( 
rejection_reason=rejection_reason )) - @session_manager async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary]: """Retrieve a batch by ID.""" @@ -897,7 +890,11 @@ async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> list[URLInfo] )) @session_manager - async def insert_url(self, session: AsyncSession, url_info: URLInfo) -> int: + async def insert_url( + self, + session: AsyncSession, + url_info: URLInfo + ) -> int: """Insert a new URL into the database.""" url_entry = URL( url=url_info.url, @@ -916,21 +913,33 @@ async def insert_url(self, session: AsyncSession, url_info: URLInfo) -> int: return url_entry.id @session_manager - async def get_url_info_by_url(self, session: AsyncSession, url: str) -> Optional[URLInfo]: + async def get_url_info_by_url( + self, + session: AsyncSession, + url: str + ) -> URLInfo | None: query = Select(URL).where(URL.url == url) raw_result = await session.execute(query) url = raw_result.scalars().first() return URLInfo(**url.__dict__) @session_manager - async def get_url_info_by_id(self, session: AsyncSession, url_id: int) -> Optional[URLInfo]: + async def get_url_info_by_id( + self, + session: AsyncSession, + url_id: int + ) -> URLInfo | None: query = Select(URL).where(URL.id == url_id) raw_result = await session.execute(query) url = raw_result.scalars().first() return URLInfo(**url.__dict__) @session_manager - async def insert_logs(self, session, log_infos: List[LogInfo]): + async def insert_logs( + self, + session: AsyncSession, + log_infos: list[LogInfo] + ) -> None: for log_info in log_infos: log = Log(log=log_info.log, batch_id=log_info.batch_id) if log_info.created_at is not None: @@ -938,7 +947,11 @@ async def insert_logs(self, session, log_infos: List[LogInfo]): session.add(log) @session_manager - async def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo]): + async def insert_duplicates( + self, + session: AsyncSession, + duplicate_infos: list[DuplicateInsertInfo] 
+ ) -> None: for duplicate_info in duplicate_infos: duplicate = Duplicate( batch_id=duplicate_info.duplicate_batch_id, @@ -947,7 +960,11 @@ async def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsert session.add(duplicate) @session_manager - async def insert_batch(self, session: AsyncSession, batch_info: BatchInfo) -> int: + async def insert_batch( + self, + session: AsyncSession, + batch_info: BatchInfo + ) -> int: """Insert a new batch into the database and return its ID.""" batch = Batch( strategy=batch_info.strategy, @@ -967,7 +984,11 @@ async def insert_batch(self, session: AsyncSession, batch_info: BatchInfo) -> in await session.flush() return batch.id - async def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo: + async def insert_urls( + self, + url_infos: list[URLInfo], + batch_id: int + ) -> InsertURLsInfo: url_mappings = [] duplicates = [] for url_info in url_infos: @@ -995,14 +1016,14 @@ async def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertUR @session_manager async def update_batch_post_collection( self, - session, + session: AsyncSession, batch_id: int, total_url_count: int, original_url_count: int, duplicate_url_count: int, batch_status: BatchStatus, compute_time: float = None, - ): + ) -> None: query = Select(Batch).where(Batch.id == batch_id) result = await session.execute(query) @@ -1068,7 +1089,7 @@ async def delete_old_logs(self): async def get_next_url_for_all_annotations( self, batch_id: int | None = None - ) -> GetNextURLForAllAnnotationResponse: + ) -> GetNextURLForAllAnnotationResponse: return await self.run_query_builder(GetNextURLForAllAnnotationQueryBuilder(batch_id)) @session_manager @@ -1117,7 +1138,6 @@ async def upload_manual_batch( dto=dto )) - @session_manager async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: query = select(URL).where(URL.url == url) @@ -1138,7 +1158,6 @@ async def get_batches_aggregated_metrics(self) -> 
GetMetricsBatchesAggregatedRes GetBatchesAggregatedMetricsQueryBuilder() ) - async def get_batches_breakdown_metrics( self, page: int diff --git a/src/db/client/sync.py b/src/db/client/sync.py index e2d21705..3f23f56e 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -11,9 +11,9 @@ from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.instantiations.log.pydantic.info import LogInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.templates import Base from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.log.sqlalchemy import Log diff --git a/src/db/models/instantiations/url/core/pydantic.py b/src/db/models/instantiations/url/core/pydantic.py deleted file mode 100644 index e409c32c..00000000 --- a/src/db/models/instantiations/url/core/pydantic.py +++ /dev/null @@ -1,17 +0,0 @@ -import datetime -from typing import Optional - -from pydantic import BaseModel - -from src.collectors.enums import URLStatus - - -class URLInfo(BaseModel): - id: Optional[int] = None - batch_id: Optional[int] = None - url: str - collector_metadata: Optional[dict] = None - outcome: URLStatus = URLStatus.PENDING - updated_at: Optional[datetime.datetime] = None - created_at: Optional[datetime.datetime] = None - name: Optional[str] = None diff --git a/src/db/models/instantiations/url/core/pydantic/__init__.py b/src/db/models/instantiations/url/core/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/core/pydantic/info.py b/src/db/models/instantiations/url/core/pydantic/info.py new file mode 100644 index 00000000..6099db29 --- /dev/null +++ 
b/src/db/models/instantiations/url/core/pydantic/info.py @@ -0,0 +1,17 @@ +import datetime +from typing import Optional + +from pydantic import BaseModel + +from src.collectors.enums import URLStatus + + +class URLInfo(BaseModel): + id: int | None = None + batch_id: int | None= None + url: str + collector_metadata: dict | None = None + outcome: URLStatus = URLStatus.PENDING + updated_at: datetime.datetime | None = None + created_at: datetime.datetime | None = None + name: str | None = None diff --git a/src/db/models/instantiations/url/core/pydantic/insert.py b/src/db/models/instantiations/url/core/pydantic/insert.py new file mode 100644 index 00000000..230c93c0 --- /dev/null +++ b/src/db/models/instantiations/url/core/pydantic/insert.py @@ -0,0 +1,19 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.templates import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLInsertModel(BulkInsertableModel): + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URL + + url: str + collector_metadata: dict | None = None + name: str + outcome: URLStatus + record_type: RecordType \ No newline at end of file diff --git a/src/db/models/instantiations/url/web_metadata/pydantic.py b/src/db/models/instantiations/url/web_metadata/pydantic.py index e46a60b9..31a05d4a 100644 --- a/src/db/models/instantiations/url/web_metadata/pydantic.py +++ b/src/db/models/instantiations/url/web_metadata/pydantic.py @@ -1,7 +1,16 @@ +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.templates import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel class URLWebMetadataPydantic(BulkInsertableModel): + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URLWebMetadata + + url_id: 
int accessed: bool status_code: int | None diff --git a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py index 48beb4b4..903bdc43 100644 --- a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py +++ b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py @@ -4,7 +4,7 @@ from src.db.models.templates import StandardBase -class UrlWebMetadata( +class URLWebMetadata( StandardBase, URLDependentMixin, CreatedAtMixin, diff --git a/src/external/url_request/probe/model.py b/src/external/url_request/probe/model.py index 6ddff60e..27caa680 100644 --- a/src/external/url_request/probe/model.py +++ b/src/external/url_request/probe/model.py @@ -8,8 +8,15 @@ class URLProbeResponse(BaseModel): error: str | None = None @model_validator(mode='after') - def check_error_mutually_exclusive_with_status_and_content(self): - if self.error is not None: - if self.status_code is not None or self.content_type is not None: - raise ValueError('Error is mutually exclusive with status code and content type') + def check_error_mutually_exclusive_with_content(self): + if self.error is None: + if self.content_type is None: + raise ValueError('Content type required if no error') + if self.status_code is None: + raise ValueError('Status code required if no error') + return self + + if self.content_type is not None: + raise ValueError('Content type mutually exclusive with error') + return self diff --git a/tests/automated/integration/db/client/test_delete_url_updated_at.py b/tests/automated/integration/db/client/test_delete_url_updated_at.py index 620e0318..f0bebaaf 100644 --- a/tests/automated/integration/db/client/test_delete_url_updated_at.py +++ b/tests/automated/integration/db/client/test_delete_url_updated_at.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.core import 
DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 9fd65eed..28a2483d 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -3,7 +3,7 @@ from src.core.enums import BatchStatus from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL diff --git a/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py index 5cae5a26..7eb5a7f9 100644 --- a/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py +++ b/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py @@ -26,7 +26,6 @@ async def test_agency_identification_task( ): """Test full flow of AgencyIdentificationTaskOperator""" - # Confirm does not yet meet prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/url/probe/conftest.py b/tests/automated/integration/tasks/url/probe/conftest.py index e69de29b..b8836a4b 100644 --- a/tests/automated/integration/tasks/url/probe/conftest.py +++ b/tests/automated/integration/tasks/url/probe/conftest.py @@ -0,0 +1,15 @@ +import pytest_asyncio + +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator +from src.external.url_request.core import URLRequestInterface +from tests.automated.integration.tasks.url.probe.constants import PATCH_ROOT +from tests.automated.integration.tasks.url.probe.setup.mocks.probe_manager import MockURLProbeManager + + 
+@pytest_asyncio.fixture +async def operator(adb_client_test, monkeypatch): + monkeypatch.setattr(PATCH_ROOT, MockURLProbeManager) + yield URLProbeTaskOperator( + adb_client=adb_client_test, + url_request_interface=URLRequestInterface() + ) diff --git a/tests/automated/integration/tasks/url/probe/constants.py b/tests/automated/integration/tasks/url/probe/constants.py new file mode 100644 index 00000000..6bc307e5 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/constants.py @@ -0,0 +1,3 @@ + + +PATCH_ROOT = "src.external.url_request.core.URLProbeManager" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/core.py b/tests/automated/integration/tasks/url/probe/setup/core.py new file mode 100644 index 00000000..1884798b --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/core.py @@ -0,0 +1,22 @@ +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES + + +async def create_urls_in_db( + adb_client: AsyncDatabaseClient, +) -> None: + record_types = [rt for rt in RecordType] + urls = [] + for idx, entry in enumerate(SETUP_ENTRIES): + url = URLInsertModel( + url=entry.url, + outcome=entry.url_status, + name=f"test-url-probe-task-url-{idx}", + record_type=record_types[idx] + ) + urls.append(url) + await adb_client.bulk_insert(urls) + diff --git a/tests/automated/integration/tasks/url/probe/setup/data.py b/tests/automated/integration/tasks/url/probe/setup/data.py new file mode 100644 index 00000000..85ad2547 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/data.py @@ -0,0 +1,36 @@ +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.url.probe.setup.models.entry import 
TestURLProbeTaskEntry +from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse + +SETUP_ENTRIES: list[TestURLProbeTaskEntry] = [ + TestURLProbeTaskEntry( + url="https://pending.com", + url_status=URLStatus.PENDING, + url_probe_response=URLProbePlannedResponse( + status_code=200, + content_type="text/html", + error=None + ), + expected_accessed=True + ), + TestURLProbeTaskEntry( + url="https://submitted.com", + url_status=URLStatus.SUBMITTED, + url_probe_response=URLProbePlannedResponse( + status_code=500, + content_type=None, + error="test error" + ), + expected_accessed=True + ), + TestURLProbeTaskEntry( + url="https://failure.com", + url_status=URLStatus.ERROR, + url_probe_response=URLProbePlannedResponse( + status_code=None, + content_type=None, + error="URL not found" + ), + expected_accessed=False + ) +] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/format.py b/tests/automated/integration/tasks/url/probe/setup/format.py new file mode 100644 index 00000000..8cb2fdb0 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/format.py @@ -0,0 +1,24 @@ +from src.external.url_request.probe.model import URLProbeResponse +from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES +from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry + + +def build_url_to_probe_response_map( +) -> dict[str, URLProbeResponse]: + d = {} + for entry in SETUP_ENTRIES: + probe_response = URLProbeResponse( + url=entry.url, + status_code=entry.url_probe_response.status_code, + content_type=entry.url_probe_response.content_type, + error=entry.url_probe_response.error + ) + d[entry.url] = probe_response + return d + +def build_url_to_entry_map( +) -> dict[str, TestURLProbeTaskEntry]: + d = {} + for entry in SETUP_ENTRIES: + d[entry.url] = entry + return d \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/url/probe/setup/manager.py b/tests/automated/integration/tasks/url/probe/setup/manager.py deleted file mode 100644 index 9b5bb48b..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/manager.py +++ /dev/null @@ -1,12 +0,0 @@ -from tests.helpers.data_creator.core import DBDataCreator - - -class TestURLProbeTaskSetupManager: - - def __init__( - self, - db_data_creator: DBDataCreator - ): - self.db_data_creator = db_data_creator - - async def setup(self): diff --git a/tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py b/tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py b/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py new file mode 100644 index 00000000..ac65ea9b --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py @@ -0,0 +1,20 @@ +from aiohttp import ClientSession + +from src.external.url_request.probe.model import URLProbeResponse +from tests.automated.integration.tasks.url.probe.setup.format import build_url_to_probe_response_map + + +class MockURLProbeManager: + + def __init__( + self, + session: ClientSession + ): + self.session = session + self._url_to_probe_response: dict[str, URLProbeResponse] = build_url_to_probe_response_map() + + async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]: + return [ + self._url_to_probe_response[url] + for url in urls + ] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/models/entry.py b/tests/automated/integration/tasks/url/probe/setup/models/entry.py index b39487ef..1031969e 100644 --- a/tests/automated/integration/tasks/url/probe/setup/models/entry.py +++ b/tests/automated/integration/tasks/url/probe/setup/models/entry.py @@ -1,11 +1,12 @@ -from pydantic import model_validator +from pydantic 
import model_validator, BaseModel from src.collectors.enums import URLStatus from src.external.url_request.probe.model import URLProbeResponse from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse -class TestURLProbeTaskEntry: +class TestURLProbeTaskEntry(BaseModel): url: str url_status: URLStatus url_probe_response: URLProbePlannedResponse + expected_accessed: bool diff --git a/tests/automated/integration/tasks/url/probe/setup/queries/__init__.py b/tests/automated/integration/tasks/url/probe/setup/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/queries/check.py b/tests/automated/integration/tasks/url/probe/setup/queries/check.py new file mode 100644 index 00000000..988efffc --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/queries/check.py @@ -0,0 +1,43 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.base.builder import QueryBuilderBase +from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES +from tests.automated.integration.tasks.url.probe.setup.format import build_url_to_entry_map +from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry + + +class CheckURLsInDBForURLProbeTaskQueryBuilder(QueryBuilderBase): + + def __init__(self): + super().__init__() + self._entries = SETUP_ENTRIES + self._url_to_entry_map: dict[ + str, TestURLProbeTaskEntry + ] = build_url_to_entry_map() + + async def run(self, session: AsyncSession) -> None: + + query = ( + select( + URL.url, + URLWebMetadata.accessed, + URLWebMetadata.status_code, + URLWebMetadata.content_type, + URLWebMetadata.error_message + ) 
+ .join(URLWebMetadata, URL.id == URLWebMetadata.url_id) + ) + mappings = await sh.mappings(session, query=query) + assert len(mappings) == len(self._entries) + for mapping in mappings: + url = mapping["url"] + entry = self._url_to_entry_map[url] + assert entry.expected_accessed == mapping["accessed"] + assert entry.url_probe_response.status_code == mapping["status_code"] + assert entry.url_probe_response.content_type == mapping["content_type"] + assert entry.url_probe_response.error == mapping["error_message"] + diff --git a/tests/automated/integration/tasks/url/probe/test_core.py b/tests/automated/integration/tasks/url/probe/test_core.py index e69de29b..ee3fe50c 100644 --- a/tests/automated/integration/tasks/url/probe/test_core.py +++ b/tests/automated/integration/tasks/url/probe/test_core.py @@ -0,0 +1,33 @@ +import pytest + +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.setup.core import create_urls_in_db +from tests.automated.integration.tasks.url.probe.setup.queries.check import CheckURLsInDBForURLProbeTaskQueryBuilder + + +@pytest.mark.asyncio +async def test_url_probe_task( + operator: URLProbeTaskOperator +): + adb_client = operator.adb_client + # Check task does not yet meet pre-requisites + assert not await operator.meets_task_prerequisites() + + # Set up URLs + await create_urls_in_db(adb_client=adb_client) + + # Check task meets pre-requisites + assert await operator.meets_task_prerequisites() + + # Run task + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + + # Check task no longer meets pre-requisites + assert not await operator.meets_task_prerequisites() + + # Check results as expected + await adb_client.run_query_builder( + CheckURLsInDBForURLProbeTaskQueryBuilder() + ) diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py 
b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index 2cc91449..fc7d0bba 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -5,7 +5,7 @@ from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 94c3fde6..66328993 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -4,7 +4,7 @@ from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index 672936e0..22695f44 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -6,7 +6,7 @@ from src.collectors.source_collectors.muckrock.collectors.county.core import 
MuckrockCountyLevelSearchCollector from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls.py index 82324042..e4602dee 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls.py @@ -3,7 +3,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.simple_test_data_functions import generate_test_urls diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index d0a951f8..11259576 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -1,33 +1,21 @@ -from collections import defaultdict from datetime import datetime -from random import randint -from typing import List, Optional, Any +from typing import Optional, Any from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.api.endpoints.review.enums import RejectionReason from 
src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.client.sync import DatabaseClient -from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType, URLStatus -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus -from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand -from tests.helpers.data_creator.commands.impl.annotate import AnnotateCommand from tests.helpers.data_creator.commands.impl.batch import DBDataCreatorBatchCommand from tests.helpers.data_creator.commands.impl.batch_v2 import BatchV2Command from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand @@ -44,8 +32,6 @@ from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer from 
tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 -from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo -from tests.helpers.simple_test_data_functions import generate_test_urls class DBDataCreator: diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 857def21..d7942b4a 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -5,7 +5,7 @@ from src.external.url_request.core import URLRequestInterface from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from tests.helpers.data_creator.core import DBDataCreator URLS = [ From ab3071e83b7611971293c8816b9b7b732a7baba2 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 1 Aug 2025 15:28:08 -0400 Subject: [PATCH 07/13] Adjust URL Html Task logic. 
--- src/core/tasks/url/operators/html/core.py | 57 ++++--------------- src/core/tasks/url/operators/html/filter.py | 44 ++++++++++++++ .../url/operators/html/models/__init__.py | 0 .../operators/html/models/subsets/__init__.py | 0 .../html/models/subsets/error_404.py | 8 +++ .../html/models/subsets/success_error.py | 8 +++ src/db/statement_composer.py | 13 +++-- .../integration/tasks/url/html/test_task.py | 11 +++- .../commands/impl/url_metadata.py | 27 +++++++++ tests/helpers/data_creator/core.py | 13 +++++ 10 files changed, 129 insertions(+), 52 deletions(-) create mode 100644 src/core/tasks/url/operators/html/filter.py create mode 100644 src/core/tasks/url/operators/html/models/__init__.py create mode 100644 src/core/tasks/url/operators/html/models/subsets/__init__.py create mode 100644 src/core/tasks/url/operators/html/models/subsets/error_404.py create mode 100644 src/core/tasks/url/operators/html/models/subsets/success_error.py create mode 100644 tests/helpers/data_creator/commands/impl/url_metadata.py diff --git a/src/core/tasks/url/operators/html/core.py b/src/core/tasks/url/operators/html/core.py index 25927e08..89cae250 100644 --- a/src/core/tasks/url/operators/html/core.py +++ b/src/core/tasks/url/operators/html/core.py @@ -1,5 +1,5 @@ -from http import HTTPStatus - +from src.core.tasks.url.operators.html.filter import get_just_urls, separate_success_and_error_subsets, \ + separate_404_and_non_404_subsets from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo @@ -36,10 +36,14 @@ async def inner_task_logic(self): url_ids = [task_info.url_info.id for task_info in tdos] await self.link_urls_to_task(url_ids=url_ids) await self.get_raw_html_data_for_urls(tdos) - success_subset, error_subset = await self.separate_success_and_error_subsets(tdos) - non_404_error_subset, is_404_error_subset = await 
self.separate_error_and_404_subsets(error_subset) - await self.process_html_data(success_subset) - await self.update_database(is_404_error_subset, non_404_error_subset, success_subset) + se_subsets = await separate_success_and_error_subsets(tdos) + err_subsets = await separate_404_and_non_404_subsets(se_subsets.error) + await self.process_html_data(se_subsets.success) + await self.update_database( + is_404_error_subset=err_subsets.is_404, + non_404_error_subset=err_subsets.not_404, + success_subset=se_subsets.success + ) async def update_database( self, @@ -51,9 +55,6 @@ async def update_database( await self.update_404s_in_database(is_404_error_subset) await self.update_html_data_in_database(success_subset) - async def get_just_urls(self, tdos: list[UrlHtmlTDO]): - return [task_info.url_info.url for task_info in tdos] - async def get_non_errored_urls_without_html_data(self): pending_urls: list[URLInfo] = await self.adb_client.get_non_errored_urls_without_html_data() tdos = [ @@ -64,46 +65,11 @@ async def get_non_errored_urls_without_html_data(self): return tdos async def get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]): - just_urls = await self.get_just_urls(tdos) + just_urls = await get_just_urls(tdos) url_response_infos = await self.url_request_interface.make_requests_with_html(just_urls) for tdto, url_response_info in zip(tdos, url_response_infos): tdto.url_response_info = url_response_info - async def separate_success_and_error_subsets( - self, - tdos: list[UrlHtmlTDO] - ) -> tuple[ - list[UrlHtmlTDO], # Successful - list[UrlHtmlTDO] # Error - ]: - errored_tdos = [] - successful_tdos = [] - for tdto in tdos: - if not tdto.url_response_info.success: - errored_tdos.append(tdto) - else: - successful_tdos.append(tdto) - return successful_tdos, errored_tdos - - async def separate_error_and_404_subsets( - self, - tdos: list[UrlHtmlTDO] - ) -> tuple[ - list[UrlHtmlTDO], # Error - list[UrlHtmlTDO] # 404 - ]: - tdos_error = [] - tdos_404 = [] - for tdo in tdos: 
- if tdo.url_response_info.status is None: - tdos_error.append(tdo) - continue - if tdo.url_response_info.status == HTTPStatus.NOT_FOUND: - tdos_404.append(tdo) - else: - tdos_error.append(tdo) - return tdos_error, tdos_404 - async def update_404s_in_database(self, tdos_404: list[UrlHtmlTDO]): url_ids = [tdo.url_info.id for tdo in tdos_404] await self.adb_client.mark_all_as_404(url_ids) @@ -121,7 +87,6 @@ async def update_errors_in_database(self, error_tdos: list[UrlHtmlTDO]): async def process_html_data(self, tdos: list[UrlHtmlTDO]): for tdto in tdos: - html_tag_info = await self.html_parser.parse( url=tdto.url_info.url, html_content=tdto.url_response_info.html, diff --git a/src/core/tasks/url/operators/html/filter.py b/src/core/tasks/url/operators/html/filter.py new file mode 100644 index 00000000..f14840e6 --- /dev/null +++ b/src/core/tasks/url/operators/html/filter.py @@ -0,0 +1,44 @@ +from http import HTTPStatus + +from src.core.tasks.url.operators.html.models.subsets.error_404 import ErrorSubsets +from src.core.tasks.url.operators.html.models.subsets.success_error import SuccessErrorSubset +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO + + +async def get_just_urls(tdos: list[UrlHtmlTDO]): + return [task_info.url_info.url for task_info in tdos] + + +async def separate_success_and_error_subsets( + tdos: list[UrlHtmlTDO] +) -> SuccessErrorSubset: + errored_tdos = [] + successful_tdos = [] + for tdto in tdos: + if not tdto.url_response_info.success: + errored_tdos.append(tdto) + else: + successful_tdos.append(tdto) + return SuccessErrorSubset( + success=successful_tdos, + error=errored_tdos + ) + + +async def separate_404_and_non_404_subsets( + tdos: list[UrlHtmlTDO] +) -> ErrorSubsets: + tdos_error = [] + tdos_404 = [] + for tdo in tdos: + if tdo.url_response_info.status is None: + tdos_error.append(tdo) + continue + if tdo.url_response_info.status == HTTPStatus.NOT_FOUND: + tdos_404.append(tdo) + else: + tdos_error.append(tdo) + return 
ErrorSubsets( + not_404=tdos_error, + is_404=tdos_404 + ) diff --git a/src/core/tasks/url/operators/html/models/__init__.py b/src/core/tasks/url/operators/html/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/html/models/subsets/__init__.py b/src/core/tasks/url/operators/html/models/subsets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/html/models/subsets/error_404.py b/src/core/tasks/url/operators/html/models/subsets/error_404.py new file mode 100644 index 00000000..f526368c --- /dev/null +++ b/src/core/tasks/url/operators/html/models/subsets/error_404.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO + + +class ErrorSubsets(BaseModel): + is_404: list[UrlHtmlTDO] + not_404: list[UrlHtmlTDO] \ No newline at end of file diff --git a/src/core/tasks/url/operators/html/models/subsets/success_error.py b/src/core/tasks/url/operators/html/models/subsets/success_error.py new file mode 100644 index 00000000..75429a6e --- /dev/null +++ b/src/core/tasks/url/operators/html/models/subsets/success_error.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO + + +class SuccessErrorSubset(BaseModel): + success: list[UrlHtmlTDO] + error: list[UrlHtmlTDO] \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index a6f468ee..5af4ba5c 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -16,6 +16,7 @@ from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType @@ -35,11 +36,13 @@ def 
has_non_errored_urls_without_html_data() -> Select: where(Task.task_status == BatchStatus.READY_TO_LABEL.value) ) query = ( - select(URL). - outerjoin(URLHTMLContent). - where(URLHTMLContent.id == None). - where(~exists(exclude_subquery)). - where(URL.outcome.in_( + select(URL) + .join(URLWebMetadata) + .outerjoin(URLHTMLContent) + .where(URLHTMLContent.id == None) + .where(~exists(exclude_subquery)) + .where(URLWebMetadata.content_type.like("%html%")) + .where(URL.outcome.in_( [ URLStatus.PENDING, URLStatus.NOT_RELEVANT, diff --git a/tests/automated/integration/tasks/url/html/test_task.py b/tests/automated/integration/tasks/url/html/test_task.py index 2592713f..da6753a4 100644 --- a/tests/automated/integration/tasks/url/html/test_task.py +++ b/tests/automated/integration/tasks/url/html/test_task.py @@ -3,7 +3,8 @@ from src.db.enums import TaskType from tests.automated.integration.tasks.url.html.asserts import assert_success_url_has_two_html_content_entries, assert_404_url_has_404_status, assert_task_has_one_url_error, \ assert_task_type_is_html, assert_html_task_ran_without_error, assert_url_has_one_compressed_html_content_entry -from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_url_task_has_expected_run_info +from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_url_task_has_expected_run_info, \ + assert_prereqs_met from tests.automated.integration.tasks.url.html.setup import setup_urls, setup_operator from tests.helpers.data_creator.core import DBDataCreator @@ -16,7 +17,15 @@ async def test_url_html_task(db_data_creator: DBDataCreator): # No URLs were created, the prereqs should not be met await assert_prereqs_not_met(operator) + + # Add URLs without adding web metadata, the prereqs should not be met url_ids = await setup_urls(db_data_creator) + await assert_prereqs_not_met(operator) + + # Add web metadata, the prereqs should be met + await db_data_creator.url_metadata(url_ids) + await 
assert_prereqs_met(operator) + success_url_id = url_ids[0] not_found_url_id = url_ids[1] diff --git a/tests/helpers/data_creator/commands/impl/url_metadata.py b/tests/helpers/data_creator/commands/impl/url_metadata.py new file mode 100644 index 00000000..6eee58ed --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/url_metadata.py @@ -0,0 +1,27 @@ +from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase + + +class URLMetadataCommand(DBDataCreatorCommandBase): + + def __init__( + self, + url_ids: list[int], + content_type: str = "text/html" + ): + super().__init__() + self.url_ids = url_ids + self.content_type = content_type + + async def run(self) -> None: + url_metadata_infos = [] + for url_id in self.url_ids: + url_metadata = URLWebMetadataPydantic( + url_id=url_id, + accessed=True, + status_code=200, + content_type=self.content_type, + error_message=None + ) + url_metadata_infos.append(url_metadata) + await self.adb_client.bulk_insert(url_metadata_infos) \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 11259576..070c9657 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -26,6 +26,7 @@ from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.user.record_type import UserRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.relevant import UserRelevantSuggestionCommand +from tests.helpers.data_creator.commands.impl.url_metadata import URLMetadataCommand from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.core import URLsV2Command from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response 
@@ -352,3 +353,15 @@ async def agency_user_suggestions( agency_annotation_info=agency_annotation_info ) ) + + async def url_metadata( + self, + url_ids: list[int], + content_type: str = "text/html" + ) -> None: + await self.run_command( + URLMetadataCommand( + url_ids=url_ids, + content_type=content_type + ) + ) From b7a0af0e66f8fbbeb680d0ef05ea60e2204ab5ec Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 1 Aug 2025 15:47:45 -0400 Subject: [PATCH 08/13] Add task to loader --- local_database/classes/DockerManager.py | 23 ++++++++++++++++------- src/core/tasks/url/loader.py | 9 +++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/local_database/classes/DockerManager.py b/local_database/classes/DockerManager.py index ac294dc1..fc32c3bc 100644 --- a/local_database/classes/DockerManager.py +++ b/local_database/classes/DockerManager.py @@ -4,6 +4,8 @@ import docker from docker.errors import APIError +from docker.models.containers import Container +from docker.models.networks import Network from local_database.DTOs import DockerfileInfo, DockerInfo from local_database.classes.DockerClient import DockerClient @@ -20,7 +22,7 @@ def __init__(self): self.network = self.start_network() @staticmethod - def start_docker_engine(): + def start_docker_engine() -> None: system = platform.system() match system: @@ -41,7 +43,7 @@ def start_docker_engine(): sys.exit(1) @staticmethod - def is_docker_running(): + def is_docker_running() -> bool: try: client = docker.from_env() client.ping() @@ -50,16 +52,23 @@ def is_docker_running(): print(f"Docker is not running: {e}") return False - def run_command(self, command: str, container_id: str): + def run_command( + self, + command: str, + container_id: str + ) -> None: self.client.run_command(command, container_id) - def start_network(self): + def start_network(self) -> Network: return self.client.start_network(self.network_name) - def stop_network(self): + def stop_network(self) -> None: 
self.client.stop_network(self.network_name) - def get_image(self, dockerfile_info: DockerfileInfo): + def get_image( + self, + dockerfile_info: DockerfileInfo + ) -> None: self.client.get_image(dockerfile_info) def run_container( @@ -74,5 +83,5 @@ def run_container( ) return DockerContainer(self.client, raw_container) - def get_containers(self): + def get_containers(self) -> list[Container]: return self.client.client.containers.list() \ No newline at end of file diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index f54ff025..59896f94 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,6 +7,7 @@ from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier @@ -101,8 +102,16 @@ async def get_url_auto_relevance_task_operator(self): ) return operator + async def get_url_probe_task_operator(self): + operator = URLProbeTaskOperator( + adb_client=self.adb_client, + url_request_interface=self.url_request_interface + ) + return operator + async def get_task_operators(self) -> list[URLTaskOperatorBase]: return [ + await self.get_url_probe_task_operator(), await self.get_url_html_task_operator(), await self.get_url_duplicate_task_operator(), await self.get_url_404_probe_task_operator(), From 7a78aedcc1640796d354665145d575a0ed61a79e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 2 Aug 2025 07:25:48 -0400 Subject: [PATCH 09/13] Fix bugs and refine --- 
.../2025_07_31_1536-99eceed6e614_add_web_status_info_table.py | 2 ++ src/external/url_request/core.py | 4 ++-- src/external/url_request/probe/core.py | 4 ++-- .../integration/tasks/url/probe/setup/models/entry.py | 3 +-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py index 077d8277..6edeaff0 100644 --- a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py +++ b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py @@ -91,9 +91,11 @@ def _drop_url_html_info_table() -> None: def upgrade() -> None: _create_url_html_info_table() + _add_url_probe_task_type_enum() def downgrade() -> None: _drop_url_html_info_table() # Drop Enums WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True) + _drop_url_probe_task_type_enum() \ No newline at end of file diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py index e2143bcc..d17164d7 100644 --- a/src/external/url_request/core.py +++ b/src/external/url_request/core.py @@ -1,4 +1,4 @@ -from aiohttp import ClientSession +from aiohttp import ClientSession, ClientTimeout from src.external.url_request.dtos.url_response import URLResponseInfo from src.external.url_request.probe.core import URLProbeManager @@ -16,6 +16,6 @@ async def make_requests_with_html( @staticmethod async def probe_urls(urls: list[str]) -> list[URLProbeResponse]: - async with ClientSession() as session: + async with ClientSession(timeout=ClientTimeout(total=30)) as session: manager = URLProbeManager(session=session) return await manager.probe_urls(urls=urls) diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py index b15286d3..0b5bb934 100644 --- a/src/external/url_request/probe/core.py +++ b/src/external/url_request/probe/core.py @@ -4,7 +4,7 @@ from src.external.url_request.probe.format import 
format_client_response, format_client_response_error, format_error from src.external.url_request.probe.model import URLProbeResponse - +from tqdm.asyncio import tqdm_asyncio class URLProbeManager: @@ -15,7 +15,7 @@ def __init__( self.session = session async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]: - return await asyncio.gather(*[self.probe_url(url) for url in urls]) + return await tqdm_asyncio.gather(*[self.probe_url(url) for url in urls]) async def probe_url(self, url: str) -> URLProbeResponse: result = await self.head(url) diff --git a/tests/automated/integration/tasks/url/probe/setup/models/entry.py b/tests/automated/integration/tasks/url/probe/setup/models/entry.py index 1031969e..6432de9c 100644 --- a/tests/automated/integration/tasks/url/probe/setup/models/entry.py +++ b/tests/automated/integration/tasks/url/probe/setup/models/entry.py @@ -1,7 +1,6 @@ -from pydantic import model_validator, BaseModel +from pydantic import BaseModel from src.collectors.enums import URLStatus -from src.external.url_request.probe.model import URLProbeResponse from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse From 98edd9a5822cdbf0070d5693d71a76aefeb728d9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 2 Aug 2025 07:26:03 -0400 Subject: [PATCH 10/13] Refactor --- local_database/classes/DockerClient.py | 12 +++++++----- local_database/classes/DockerContainer.py | 8 ++++---- local_database/classes/TimestampChecker.py | 15 +++++++-------- local_database/create_database.py | 6 +++--- local_database/setup.py | 13 +++++++++---- start_mirrored_local_app.py | 12 ++++++------ .../manual/external/url_request/test_url_probe.py | 2 +- 7 files changed, 37 insertions(+), 31 deletions(-) diff --git a/local_database/classes/DockerClient.py b/local_database/classes/DockerClient.py index ca9d535b..5c33e7d9 100644 --- a/local_database/classes/DockerClient.py +++ b/local_database/classes/DockerClient.py @@ -1,5 +1,7 
@@ import docker from docker.errors import NotFound, APIError +from docker.models.containers import Container +from docker.models.networks import Network from local_database.DTOs import DockerfileInfo, DockerInfo @@ -9,7 +11,7 @@ class DockerClient: def __init__(self): self.client = docker.from_env() - def run_command(self, command: str, container_id: str): + def run_command(self, command: str, container_id: str) -> None: exec_id = self.client.api.exec_create( container_id, cmd=command, @@ -20,7 +22,7 @@ def run_command(self, command: str, container_id: str): for line in output_stream: print(line.decode().rstrip()) - def start_network(self, network_name): + def start_network(self, network_name) -> Network: try: self.client.networks.create(network_name, driver="bridge") except APIError as e: @@ -30,14 +32,14 @@ def start_network(self, network_name): print("Network already exists") return self.client.networks.get(network_name) - def stop_network(self, network_name): + def stop_network(self, network_name) -> None: self.client.networks.get(network_name).remove() def get_image( self, dockerfile_info: DockerfileInfo, force_rebuild: bool = False - ): + ) -> None: if dockerfile_info.dockerfile_directory: # Build image from Dockerfile self.client.images.build( @@ -58,7 +60,7 @@ def get_image( except NotFound: self.client.images.pull(dockerfile_info.image_tag) - def get_existing_container(self, docker_info_name: str): + def get_existing_container(self, docker_info_name: str) -> Container | None: try: return self.client.containers.get(docker_info_name) except NotFound: diff --git a/local_database/classes/DockerContainer.py b/local_database/classes/DockerContainer.py index 33b71ce0..0a86e601 100644 --- a/local_database/classes/DockerContainer.py +++ b/local_database/classes/DockerContainer.py @@ -11,19 +11,19 @@ def __init__(self, dc: DockerClient, container: Container): self.dc = dc self.container = container - def run_command(self, command: str): + def run_command(self, 
command: str) -> None: self.dc.run_command(command, self.container.id) - def stop(self): + def stop(self) -> None: self.container.stop() - def log_to_file(self): + def log_to_file(self) -> None: logs = self.container.logs(stdout=True, stderr=True) container_name = self.container.name with open(f"{container_name}.log", "wb") as f: f.write(logs) - def wait_for_pg_to_be_ready(self): + def wait_for_pg_to_be_ready(self) -> None: for i in range(30): exit_code, output = self.container.exec_run("pg_isready") print(output) diff --git a/local_database/classes/TimestampChecker.py b/local_database/classes/TimestampChecker.py index 56779fd4..fc2c25a0 100644 --- a/local_database/classes/TimestampChecker.py +++ b/local_database/classes/TimestampChecker.py @@ -1,27 +1,26 @@ -import datetime import os -from typing import Optional +from datetime import datetime, timedelta class TimestampChecker: def __init__(self): - self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time() + self.last_run_time: datetime | None = self.load_last_run_time() - def load_last_run_time(self) -> Optional[datetime.datetime]: + def load_last_run_time(self) -> datetime | None: # Check if file `last_run.txt` exists # If it does, load the last run time if os.path.exists("local_state/last_run.txt"): with open("local_state/last_run.txt", "r") as f: - return datetime.datetime.strptime( + return datetime.strptime( f.read(), "%Y-%m-%d %H:%M:%S" ) return None - def last_run_within_24_hours(self): + def last_run_within_24_hours(self) -> bool: if self.last_run_time is None: return False - return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1) + return datetime.now() - self.last_run_time < timedelta(days=1) def set_last_run_time(self): # If directory `local_state` doesn't exist, create it @@ -29,4 +28,4 @@ def set_last_run_time(self): os.makedirs("local_state") with open("local_state/last_run.txt", "w") as f: - f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + 
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) diff --git a/local_database/create_database.py b/local_database/create_database.py index 67eae70b..e18cbd2a 100644 --- a/local_database/create_database.py +++ b/local_database/create_database.py @@ -15,7 +15,7 @@ # Connect to the default 'postgres' database to create other databases -def connect(database="postgres", autocommit=True): +def connect(database="postgres", autocommit=True) -> psycopg2.extensions.connection: conn = psycopg2.connect( dbname=database, user=POSTGRES_USER, @@ -27,7 +27,7 @@ def connect(database="postgres", autocommit=True): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) return conn -def create_database(db_name): +def create_database(db_name: str) -> None: conn = connect("postgres") with conn.cursor() as cur: cur.execute(sql.SQL(""" @@ -48,7 +48,7 @@ def create_database(db_name): except Exception as e: print(f"❌ Failed to create {db_name}: {e}") -def main(): +def main() -> None: print("Creating databases...") create_database(LOCAL_SOURCE_COLLECTOR_DB_NAME) diff --git a/local_database/setup.py b/local_database/setup.py index 99ff1da9..64f5af48 100644 --- a/local_database/setup.py +++ b/local_database/setup.py @@ -7,14 +7,19 @@ MAX_RETRIES = 20 SLEEP_SECONDS = 1 -def run_command(cmd, check=True, capture_output=False, **kwargs): +def run_command( + cmd: str, + check: bool = True, + capture_output: bool = False, + **kwargs: dict +) -> subprocess.CompletedProcess: try: return subprocess.run(cmd, shell=True, check=check, capture_output=capture_output, text=True, **kwargs) except subprocess.CalledProcessError as e: print(f"Command '{cmd}' failed: {e}") sys.exit(1) -def get_postgres_container_id(): +def get_postgres_container_id() -> str: result = run_command(f"docker-compose ps -q {POSTGRES_SERVICE_NAME}", capture_output=True) container_id = result.stdout.strip() if not container_id: @@ -22,7 +27,7 @@ def get_postgres_container_id(): sys.exit(1) return container_id -def 
wait_for_postgres(container_id): +def wait_for_postgres(container_id: str) -> None: print("Waiting for Postgres to be ready...") for i in range(MAX_RETRIES): try: @@ -36,7 +41,7 @@ def wait_for_postgres(container_id): print("Postgres did not become ready in time.") sys.exit(1) -def main(): +def main() -> None: print("Stopping Docker Compose...") run_command("docker-compose down") diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py index e2bd10e3..9190fece 100644 --- a/start_mirrored_local_app.py +++ b/start_mirrored_local_app.py @@ -63,15 +63,15 @@ def _run_database_restore(data_dump_container) -> None: def _run_dump_if_longer_than_24_hours( - checker, + checker: TimestampChecker, data_dump_container -): +) -> None: if checker.last_run_within_24_hours(): print("Last run within 24 hours, skipping dump...") - else: - data_dump_container.run_command( - DUMP_SH_DOCKER_PATH, - ) + return + data_dump_container.run_command( + DUMP_SH_DOCKER_PATH, + ) if __name__ == "__main__": diff --git a/tests/manual/external/url_request/test_url_probe.py b/tests/manual/external/url_request/test_url_probe.py index 75396746..d13d0f80 100644 --- a/tests/manual/external/url_request/test_url_probe.py +++ b/tests/manual/external/url_request/test_url_probe.py @@ -1,6 +1,6 @@ import pytest -from src.external.url_request.probe import URLProbeManager +from src.external.url_request.probe.core import URLProbeManager URLS = [ "https://www.google.com", From 158f211223ea4d67727199d16f1be46c3c19a9b5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 3 Aug 2025 08:44:56 -0400 Subject: [PATCH 11/13] Refine HTML task --- alembic/env.py | 3 +- ...-99eceed6e614_add_web_status_info_table.py | 87 +++++++++++++---- .../huggingface/queries/check/requester.py | 2 +- .../scheduled/huggingface/queries/get/core.py | 4 +- .../auto_relevant/queries/get_tdos.py | 2 +- .../url/operators/html/content_info_getter.py | 3 +- src/core/tasks/url/operators/html/core.py | 96 +++++++------------ 
src/core/tasks/url/operators/html/filter.py | 43 ++------- ...nding_urls_without_html_data.py => get.py} | 0 .../operators/html/queries/insert/__init__.py | 0 .../operators/html/queries/insert/convert.py | 73 ++++++++++++++ .../operators/html/queries/insert/query.py | 30 ++++++ .../operators/html/scraper/parser/mapping.py | 2 +- src/core/tasks/url/operators/html/tdo.py | 4 +- ...pending_urls_missing_miscellaneous_data.py | 2 +- src/db/client/async_.py | 16 ++-- src/db/client/sync.py | 2 +- src/db/dto_converter.py | 5 +- src/db/dtos/url/html_content.py | 30 +++--- src/db/helpers/session/parser.py | 2 +- src/db/helpers/session/session_helper.py | 5 +- .../instantiations/agency/pydantic/upsert.py | 2 +- .../instantiations/agency/sqlalchemy.py | 4 +- .../models/instantiations/backlog_snapshot.py | 4 +- .../models/instantiations/batch/sqlalchemy.py | 4 +- src/db/models/instantiations/change_log.py | 4 +- .../instantiations/duplicate/sqlalchemy.py | 4 +- .../models/instantiations/link/batch_url.py | 4 +- src/db/models/instantiations/link/task_url.py | 2 +- .../link/url_agency/sqlalchemy.py | 4 +- .../models/instantiations/log/sqlalchemy.py | 4 +- src/db/models/instantiations/missing.py | 4 +- .../models/instantiations/root_url_cache.py | 4 +- .../instantiations/state/huggingface.py | 2 +- .../instantiations/state/sync/agencies.py | 2 +- .../instantiations/state/sync/data_sources.py | 2 +- src/db/models/instantiations/task/core.py | 4 +- src/db/models/instantiations/task/error.py | 4 +- .../url/checked_for_duplicate.py | 4 +- .../url/core/pydantic/insert.py | 2 +- .../instantiations/url/core/sqlalchemy.py | 16 +++- .../url/data_source/sqlalchemy.py | 4 +- .../instantiations/url/error_info/pydantic.py | 12 ++- .../url/error_info/sqlalchemy.py | 4 +- .../instantiations/url/html/__init__.py | 0 .../url/html/compressed/__init__.py | 0 .../url/html/compressed/pydantic.py | 13 +++ .../compressed/sqlalchemy.py} | 4 +- .../url/html/content/__init__.py | 0 
.../instantiations/url/html/content/enums.py | 13 +++ .../url/html/content/pydantic.py | 0 .../content/sqlalchemy.py} | 8 +- .../url/optional_data_source_metadata.py | 4 +- .../instantiations/url/probed_for_404.py | 4 +- .../instantiations/url/reviewing_user.py | 4 +- .../url/scrape_info/__init__.py | 0 .../instantiations/url/scrape_info/enums.py | 6 ++ .../url/scrape_info/pydantic.py | 13 +++ .../url/scrape_info/sqlalchemy.py | 17 ++++ .../url/suggestion/agency/auto.py | 4 +- .../url/suggestion/agency/user.py | 4 +- .../url/suggestion/record_type/auto.py | 4 +- .../url/suggestion/record_type/user.py | 4 +- .../suggestion/relevant/auto/sqlalchemy.py | 4 +- .../url/suggestion/relevant/user.py | 4 +- .../url/web_metadata/pydantic.py | 2 +- .../url/web_metadata/sqlalchemy.py | 4 +- src/db/models/templates.py | 11 --- src/db/models/templates_/__init__.py | 0 src/db/models/templates_/base.py | 4 + src/db/models/templates_/standard.py | 14 +++ src/db/models/templates_/with_id.py | 11 +++ .../core/get/html_content_info.py | 2 +- src/db/statement_composer.py | 24 ++--- .../templates/protocols/sa_correlated/core.py | 2 +- .../protocols/sa_correlated/with_id.py | 2 +- src/external/url_request/dtos/url_response.py | 8 +- src/util/alembic_helpers.py | 15 ++- .../integration/db/structure/testers/table.py | 2 +- .../huggingface/setup/queries/setup.py | 2 +- .../integration/tasks/url/html/asserts.py | 52 ---------- .../tasks/url/html/check/__init__.py | 0 .../tasks/url/html/check/manager.py | 66 +++++++++++++ .../tasks/url/html/mocks/constants.py | 3 - .../tasks/url/html/mocks/methods.py | 46 --------- .../mocks/url_request_interface/__init__.py | 0 .../html/mocks/url_request_interface/core.py | 11 +++ .../html/mocks/url_request_interface/setup.py | 45 +++++++++ .../integration/tasks/url/html/setup.py | 41 -------- .../tasks/url/html/setup/__init__.py | 0 .../integration/tasks/url/html/setup/data.py | 94 ++++++++++++++++++ .../tasks/url/html/setup/manager.py | 87 
+++++++++++++++++ .../tasks/url/html/setup/models/__init__.py | 0 .../tasks/url/html/setup/models/entry.py | 34 +++++++ .../tasks/url/html/setup/models/record.py | 8 ++ .../integration/tasks/url/html/test_task.py | 48 ++++------ .../data_creator/commands/impl/html_data.py | 11 ++- .../commands/impl/url_metadata.py | 8 +- tests/helpers/data_creator/core.py | 7 +- tests/helpers/setup/wipe.py | 2 +- tests/helpers/simple_test_data_functions.py | 14 +++ .../test_deepseek_record_classifier.py | 2 +- .../test_openai_record_classifier.py | 2 +- 103 files changed, 858 insertions(+), 447 deletions(-) rename src/core/tasks/url/operators/html/queries/{get_pending_urls_without_html_data.py => get.py} (100%) create mode 100644 src/core/tasks/url/operators/html/queries/insert/__init__.py create mode 100644 src/core/tasks/url/operators/html/queries/insert/convert.py create mode 100644 src/core/tasks/url/operators/html/queries/insert/query.py create mode 100644 src/db/models/instantiations/url/html/__init__.py create mode 100644 src/db/models/instantiations/url/html/compressed/__init__.py create mode 100644 src/db/models/instantiations/url/html/compressed/pydantic.py rename src/db/models/instantiations/url/{compressed_html.py => html/compressed/sqlalchemy.py} (86%) create mode 100644 src/db/models/instantiations/url/html/content/__init__.py create mode 100644 src/db/models/instantiations/url/html/content/enums.py create mode 100644 src/db/models/instantiations/url/html/content/pydantic.py rename src/db/models/instantiations/url/{html_content.py => html/content/sqlalchemy.py} (82%) create mode 100644 src/db/models/instantiations/url/scrape_info/__init__.py create mode 100644 src/db/models/instantiations/url/scrape_info/enums.py create mode 100644 src/db/models/instantiations/url/scrape_info/pydantic.py create mode 100644 src/db/models/instantiations/url/scrape_info/sqlalchemy.py delete mode 100644 src/db/models/templates.py create mode 100644 src/db/models/templates_/__init__.py 
create mode 100644 src/db/models/templates_/base.py create mode 100644 src/db/models/templates_/standard.py create mode 100644 src/db/models/templates_/with_id.py delete mode 100644 tests/automated/integration/tasks/url/html/asserts.py create mode 100644 tests/automated/integration/tasks/url/html/check/__init__.py create mode 100644 tests/automated/integration/tasks/url/html/check/manager.py delete mode 100644 tests/automated/integration/tasks/url/html/mocks/constants.py create mode 100644 tests/automated/integration/tasks/url/html/mocks/url_request_interface/__init__.py create mode 100644 tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py create mode 100644 tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py delete mode 100644 tests/automated/integration/tasks/url/html/setup.py create mode 100644 tests/automated/integration/tasks/url/html/setup/__init__.py create mode 100644 tests/automated/integration/tasks/url/html/setup/data.py create mode 100644 tests/automated/integration/tasks/url/html/setup/manager.py create mode 100644 tests/automated/integration/tasks/url/html/setup/models/__init__.py create mode 100644 tests/automated/integration/tasks/url/html/setup/models/entry.py create mode 100644 tests/automated/integration/tasks/url/html/setup/models/record.py diff --git a/alembic/env.py b/alembic/env.py index 2cf7e6c8..ff14698b 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,4 +1,3 @@ -import logging from datetime import datetime from logging.config import fileConfig @@ -7,7 +6,7 @@ from sqlalchemy import pool from src.db.helpers.connect import get_postgres_connection_string -from src.db.models.templates import Base +from src.db.models.templates_.base import Base # this is the Alembic Config object, which provides # access to the values within the .ini file in use. 
diff --git a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py index 6edeaff0..891bef3a 100644 --- a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py +++ b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py @@ -25,8 +25,75 @@ "404_not_found", name="web_status" ) +SCRAPE_STATUS_ENUM = sa.Enum( + "success", + "error", + name="scrape_status", +) + +URL_WEB_METADATA_TABLE_NAME = 'url_web_metadata' +URL_SCRAPE_INFO = 'url_scrape_info' + + + + + +def upgrade() -> None: + _create_url_html_info_table() + _add_url_probe_task_type_enum() + _set_up_scrape_info_table() + _use_existing_html_data_to_add_scrape_info() + +def _use_existing_html_data_to_add_scrape_info(): + op.execute( + f""" + INSERT INTO {URL_SCRAPE_INFO} (url_id, status) + SELECT url_id, 'success'::scrape_status + FROM url_compressed_html + """ + ) + op.execute( + f""" + INSERT INTO {URL_SCRAPE_INFO} (url_id, status) + SELECT distinct(url_id), 'success'::scrape_status + FROM url_html_content + LEFT JOIN URL_COMPRESSED_HTML USING (url_id) + WHERE URL_COMPRESSED_HTML.url_id IS NULL + """ + ) + +def downgrade() -> None: + _drop_scrape_info_table() + # Drop Enums + WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True) + _drop_url_probe_task_type_enum() + _tear_down_scrape_info_table() + + +def _set_up_scrape_info_table(): + op.create_table( + URL_SCRAPE_INFO, + id_column(), + url_id_column(), + sa.Column( + 'status', + SCRAPE_STATUS_ENUM, + nullable=False, + comment='The status of the most recent scrape attempt.' 
+ ), + created_at_column(), + updated_at_column(), + sa.UniqueConstraint('url_id', name='uq_url_scrape_info_url_id') + ) + + + + +def _tear_down_scrape_info_table(): + op.drop_table(URL_SCRAPE_INFO) + # Drop enum + SCRAPE_STATUS_ENUM.drop(op.get_bind(), checkfirst=True) -TABLE_NAME = 'url_web_metadata' def _add_url_probe_task_type_enum() -> None: switch_enum_type( @@ -71,7 +138,7 @@ def _drop_url_probe_task_type_enum() -> None: def _create_url_html_info_table() -> None: op.create_table( - TABLE_NAME, + URL_WEB_METADATA_TABLE_NAME, id_column(), url_id_column(), sa.Column('accessed', sa.Boolean(), nullable=False), @@ -85,17 +152,5 @@ def _create_url_html_info_table() -> None: sa.CheckConstraint('status_code <= 999', name='ck_url_web_status_info_status_code_max'), ) -def _drop_url_html_info_table() -> None: - op.drop_table(TABLE_NAME) - - -def upgrade() -> None: - _create_url_html_info_table() - _add_url_probe_task_type_enum() - - -def downgrade() -> None: - _drop_url_html_info_table() - # Drop Enums - WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True) - _drop_url_probe_task_type_enum() \ No newline at end of file +def _drop_scrape_info_table() -> None: + op.drop_table(URL_WEB_METADATA_TABLE_NAME) diff --git a/src/core/tasks/scheduled/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/huggingface/queries/check/requester.py index 6af94560..33a79043 100644 --- a/src/core/tasks/scheduled/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/huggingface/queries/check/requester.py @@ -7,7 +7,7 @@ from src.collectors.enums import URLStatus from src.db.helpers.session import session_helper as sh from src.db.models.instantiations.state.huggingface import HuggingFaceUploadState -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL diff --git 
a/src/core/tasks/scheduled/huggingface/queries/get/core.py b/src/core/tasks/scheduled/huggingface/queries/get/core.py index 7deea322..906f4d4f 100644 --- a/src/core/tasks/scheduled/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/huggingface/queries/get/core.py @@ -1,5 +1,3 @@ -from typing import Any - from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -7,7 +5,7 @@ from src.core.tasks.scheduled.huggingface.queries.get.convert import convert_url_status_to_relevant, \ convert_fine_to_coarse_record_type from src.core.tasks.scheduled.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index 78e4c983..2ec72836 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -6,7 +6,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py index d861e265..fb7bdd59 
100644 --- a/src/core/tasks/url/operators/html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,5 +1,6 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType class HTMLContentInfoGetter: diff --git a/src/core/tasks/url/operators/html/core.py b/src/core/tasks/url/operators/html/core.py index 89cae250..00c1d1c3 100644 --- a/src/core/tasks/url/operators/html/core.py +++ b/src/core/tasks/url/operators/html/core.py @@ -1,14 +1,11 @@ -from src.core.tasks.url.operators.html.filter import get_just_urls, separate_success_and_error_subsets, \ - separate_404_and_non_404_subsets -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.dtos.url.raw_html import RawHTMLInfo -from src.db.enums import TaskType -from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO -from src.core.tasks.url.operators.html.content_info_getter import HTMLContentInfoGetter from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.html.filter import filter_just_urls, filter_404_subset +from src.core.tasks.url.operators.html.queries.insert.query import InsertURLHTMLInfoQueryBuilder from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.client.async_ import AsyncDatabaseClient +from src.db.enums import TaskType +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.external.url_request.core import URLRequestInterface @@ -25,37 +22,26 @@ def __init__( 
self.html_parser = html_parser @property - def task_type(self): + def task_type(self) -> TaskType: return TaskType.HTML - async def meets_task_prerequisites(self): + async def meets_task_prerequisites(self) -> bool: return await self.adb_client.has_non_errored_urls_without_html_data() - async def inner_task_logic(self): - tdos = await self.get_non_errored_urls_without_html_data() + async def inner_task_logic(self) -> None: + tdos = await self._get_non_errored_urls_without_html_data() url_ids = [task_info.url_info.id for task_info in tdos] await self.link_urls_to_task(url_ids=url_ids) - await self.get_raw_html_data_for_urls(tdos) - se_subsets = await separate_success_and_error_subsets(tdos) - err_subsets = await separate_404_and_non_404_subsets(se_subsets.error) - await self.process_html_data(se_subsets.success) - await self.update_database( - is_404_error_subset=err_subsets.is_404, - non_404_error_subset=err_subsets.not_404, - success_subset=se_subsets.success - ) - async def update_database( - self, - is_404_error_subset: list[UrlHtmlTDO], - non_404_error_subset: list[UrlHtmlTDO], - success_subset: list[UrlHtmlTDO] - ): - await self.update_errors_in_database(non_404_error_subset) - await self.update_404s_in_database(is_404_error_subset) - await self.update_html_data_in_database(success_subset) + await self._get_raw_html_data_for_urls(tdos) + await self._process_html_data(tdos) + + tdos_404 = await filter_404_subset(tdos) + await self._update_404s_in_database(tdos_404) + await self._update_html_data_in_database(tdos) + - async def get_non_errored_urls_without_html_data(self): + async def _get_non_errored_urls_without_html_data(self) -> list[UrlHtmlTDO]: pending_urls: list[URLInfo] = await self.adb_client.get_non_errored_urls_without_html_data() tdos = [ UrlHtmlTDO( @@ -64,29 +50,25 @@ async def get_non_errored_urls_without_html_data(self): ] return tdos - async def get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]): - just_urls = await get_just_urls(tdos) + 
async def _get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]) -> None: + just_urls = await filter_just_urls(tdos) url_response_infos = await self.url_request_interface.make_requests_with_html(just_urls) for tdto, url_response_info in zip(tdos, url_response_infos): tdto.url_response_info = url_response_info - async def update_404s_in_database(self, tdos_404: list[UrlHtmlTDO]): + async def _update_404s_in_database(self, tdos_404: list[UrlHtmlTDO]) -> None: url_ids = [tdo.url_info.id for tdo in tdos_404] await self.adb_client.mark_all_as_404(url_ids) - async def update_errors_in_database(self, error_tdos: list[UrlHtmlTDO]): - error_infos = [] - for error_tdo in error_tdos: - error_info = URLErrorPydanticInfo( - task_id=self.task_id, - url_id=error_tdo.url_info.id, - error=str(error_tdo.url_response_info.exception), - ) - error_infos.append(error_info) - await self.adb_client.add_url_error_infos(error_infos) - async def process_html_data(self, tdos: list[UrlHtmlTDO]): + async def _process_html_data(self, tdos: list[UrlHtmlTDO]) -> None: + """ + Modifies: + tdto.html_tag_info + """ for tdto in tdos: + if not tdto.url_response_info.success: + continue html_tag_info = await self.html_parser.parse( url=tdto.url_info.url, html_content=tdto.url_response_info.html, @@ -94,21 +76,9 @@ async def process_html_data(self, tdos: list[UrlHtmlTDO]): ) tdto.html_tag_info = html_tag_info - async def update_html_data_in_database(self, tdos: list[UrlHtmlTDO]): - html_content_infos = [] - raw_html_data = [] - for tdto in tdos: - hcig = HTMLContentInfoGetter( - response_html_info=tdto.html_tag_info, - url_id=tdto.url_info.id - ) - rhi = RawHTMLInfo( - url_id=tdto.url_info.id, - html=tdto.url_response_info.html - ) - raw_html_data.append(rhi) - results = hcig.get_all_html_content() - html_content_infos.extend(results) + async def _update_html_data_in_database(self, tdos: list[UrlHtmlTDO]) -> None: + await self.adb_client.run_query_builder( + InsertURLHTMLInfoQueryBuilder(tdos, 
task_id=self.task_id) + ) + - await self.adb_client.add_html_content_infos(html_content_infos) - await self.adb_client.add_raw_html(raw_html_data) diff --git a/src/core/tasks/url/operators/html/filter.py b/src/core/tasks/url/operators/html/filter.py index f14840e6..86da0e8a 100644 --- a/src/core/tasks/url/operators/html/filter.py +++ b/src/core/tasks/url/operators/html/filter.py @@ -1,44 +1,13 @@ from http import HTTPStatus -from src.core.tasks.url.operators.html.models.subsets.error_404 import ErrorSubsets -from src.core.tasks.url.operators.html.models.subsets.success_error import SuccessErrorSubset from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO -async def get_just_urls(tdos: list[UrlHtmlTDO]): +async def filter_just_urls(tdos: list[UrlHtmlTDO]): return [task_info.url_info.url for task_info in tdos] - -async def separate_success_and_error_subsets( - tdos: list[UrlHtmlTDO] -) -> SuccessErrorSubset: - errored_tdos = [] - successful_tdos = [] - for tdto in tdos: - if not tdto.url_response_info.success: - errored_tdos.append(tdto) - else: - successful_tdos.append(tdto) - return SuccessErrorSubset( - success=successful_tdos, - error=errored_tdos - ) - - -async def separate_404_and_non_404_subsets( - tdos: list[UrlHtmlTDO] -) -> ErrorSubsets: - tdos_error = [] - tdos_404 = [] - for tdo in tdos: - if tdo.url_response_info.status is None: - tdos_error.append(tdo) - continue - if tdo.url_response_info.status == HTTPStatus.NOT_FOUND: - tdos_404.append(tdo) - else: - tdos_error.append(tdo) - return ErrorSubsets( - not_404=tdos_error, - is_404=tdos_404 - ) +async def filter_404_subset(tdos: list[UrlHtmlTDO]) -> list[UrlHtmlTDO]: + return [ + tdo for tdo in tdos + if tdo.url_response_info.status == HTTPStatus.NOT_FOUND + ] diff --git a/src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/html/queries/get.py similarity index 100% rename from 
src/core/tasks/url/operators/html/queries/get_pending_urls_without_html_data.py rename to src/core/tasks/url/operators/html/queries/get.py diff --git a/src/core/tasks/url/operators/html/queries/insert/__init__.py b/src/core/tasks/url/operators/html/queries/insert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/html/queries/insert/convert.py b/src/core/tasks/url/operators/html/queries/insert/convert.py new file mode 100644 index 00000000..9c9906d8 --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/insert/convert.py @@ -0,0 +1,73 @@ +from http import HTTPStatus + +from src.core.tasks.url.operators.html.content_info_getter import HTMLContentInfoGetter +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.instantiations.url.scrape_info.pydantic import URLScrapeInfoInsertModel +from src.db.utils.compression import compress_html +from src.external.url_request.dtos.url_response import URLResponseInfo + + +def convert_to_compressed_html(tdos: list[UrlHtmlTDO]) -> list[URLCompressedHTMLPydantic]: + models = [] + for tdo in tdos: + if tdo.url_response_info.status != HTTPStatus.OK: + continue + model = URLCompressedHTMLPydantic( + url_id=tdo.url_info.id, + compressed_html=compress_html(tdo.url_response_info.html) + ) + models.append(model) + return models + + + +def _convert_to_html_content_info_getter(tdo: UrlHtmlTDO) -> HTMLContentInfoGetter: + return HTMLContentInfoGetter( + response_html_info=tdo.html_tag_info, + url_id=tdo.url_info.id + ) + +def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContentInfo]: + html_content_infos = [] + for tdo 
in tdos: + if tdo.url_response_info.status != HTTPStatus.OK: + continue + hcig = _convert_to_html_content_info_getter(tdo) + results = hcig.get_all_html_content() + html_content_infos.extend(results) + return html_content_infos + +def get_scrape_status(response_info: URLResponseInfo) -> ScrapeStatus: + if response_info.success: + return ScrapeStatus.SUCCESS + return ScrapeStatus.ERROR + +def convert_to_scrape_infos(tdos: list[UrlHtmlTDO]) -> list[URLScrapeInfoInsertModel]: + models = [] + for tdo in tdos: + model = URLScrapeInfoInsertModel( + url_id=tdo.url_info.id, + status=get_scrape_status(tdo.url_response_info) + ) + models.append(model) + return models + +def convert_to_url_errors( + tdos: list[UrlHtmlTDO], + task_id: int +) -> list[URLErrorPydanticInfo]: + models = [] + for tdo in tdos: + if tdo.url_response_info.success: + continue + model = URLErrorPydanticInfo( + url_id=tdo.url_info.id, + error=tdo.url_response_info.exception, + task_id=task_id + ) + models.append(model) + return models \ No newline at end of file diff --git a/src/core/tasks/url/operators/html/queries/insert/query.py b/src/core/tasks/url/operators/html/queries/insert/query.py new file mode 100644 index 00000000..e0bff2e6 --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/insert/query.py @@ -0,0 +1,30 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.html.queries.insert.convert import convert_to_compressed_html, \ + convert_to_html_content_info_list, convert_to_scrape_infos, convert_to_url_errors +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class InsertURLHTMLInfoQueryBuilder(QueryBuilderBase): + + def __init__(self, tdos: list[UrlHtmlTDO], task_id: int): + super().__init__() + self.tdos = tdos + self.task_id = task_id + + async def run(self, session: AsyncSession) -> None: + compressed_html_models = 
convert_to_compressed_html(self.tdos) + url_html_content_list = convert_to_html_content_info_list(self.tdos) + scrape_info_list = convert_to_scrape_infos(self.tdos) + url_errors = convert_to_url_errors(self.tdos, task_id=self.task_id) + + for models in [ + compressed_html_models, + url_html_content_list, + scrape_info_list, + url_errors + ]: + await sh.bulk_insert(session, models=models) + + diff --git a/src/core/tasks/url/operators/html/scraper/parser/mapping.py b/src/core/tasks/url/operators/html/scraper/parser/mapping.py index 6b5f0b83..641af779 100644 --- a/src/core/tasks/url/operators/html/scraper/parser/mapping.py +++ b/src/core/tasks/url/operators/html/scraper/parser/mapping.py @@ -1,4 +1,4 @@ -from src.db.dtos.url.html_content import HTMLContentType +from src.db.models.instantiations.url.html.content.enums import HTMLContentType ENUM_TO_ATTRIBUTE_MAPPING = { HTMLContentType.TITLE: "title", diff --git a/src/core/tasks/url/operators/html/tdo.py b/src/core/tasks/url/operators/html/tdo.py index 98bd12da..6395e363 100644 --- a/src/core/tasks/url/operators/html/tdo.py +++ b/src/core/tasks/url/operators/html/tdo.py @@ -9,6 +9,6 @@ class UrlHtmlTDO(BaseModel): url_info: URLInfo - url_response_info: Optional[URLResponseInfo] = None - html_tag_info: Optional[ResponseHTMLInfo] = None + url_response_info: URLResponseInfo | None = None + html_tag_info: ResponseHTMLInfo | None = None diff --git a/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index e87fcaac..ed411bd6 100644 --- a/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType from src.core.tasks.url.operators.misc_metadata.tdo import 
URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo -from src.db.dtos.url.html_content import HTMLContentType +from src.db.models.instantiations.url.html.content.enums import HTMLContentType from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 69c88cbe..9bc29ed8 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -85,7 +85,7 @@ from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO -from src.core.tasks.url.operators.html.queries.get_pending_urls_without_html_data import \ +from src.core.tasks.url.operators.html.queries.get import \ GetPendingURLsWithoutHTMLDataQueryBuilder from src.core.tasks.url.operators.misc_metadata.queries.get_pending_urls_missing_miscellaneous_data import \ GetPendingURLsMissingMiscellaneousDataQueryBuilder @@ -120,13 +120,13 @@ from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo -from 
src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion @@ -136,7 +136,8 @@ from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion -from src.db.models.templates import Base +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.templates_.base import Base from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder @@ -244,8 +245,9 @@ async def bulk_insert( self, session: AsyncSession, models: list[BulkInsertableModel], - ): - return await sh.bulk_insert(session, models) + return_ids: bool = False + ) -> list[int] | None: + return await sh.bulk_insert(session, models=models, return_ids=return_ids) @session_manager async def scalar(self, session: AsyncSession, statement): @@ -1444,6 +1446,8 @@ async def mark_all_as_duplicates(self, url_ids: List[int]): async def mark_all_as_404(self, url_ids: List[int]): query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.NOT_FOUND.value) await self.execute(query) + query = update(URLWebMetadata).where(URLWebMetadata.id.in_(url_ids)).values(status_code=404) + await self.execute(query) async def mark_all_as_recently_probed_for_404( 
self, diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 3f23f56e..613c335b 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -14,7 +14,7 @@ from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.log.sqlalchemy import Log from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 4f21c8c2..869b8978 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -7,13 +7,14 @@ from src.core.enums import RecordType, SuggestionType from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING -from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType from src.db.dtos.url.with_html import URLWithHTML from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.core.sqlalchemy import URL from 
src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion diff --git a/src/db/dtos/url/html_content.py b/src/db/dtos/url/html_content.py index f8b24eb0..1d3d67bf 100644 --- a/src/db/dtos/url/html_content.py +++ b/src/db/dtos/url/html_content.py @@ -1,21 +1,15 @@ -from enum import Enum -from typing import Optional +from src.db.models.instantiations.url.html.content.enums import HTMLContentType +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel -from pydantic import BaseModel - -class HTMLContentType(Enum): - TITLE = "Title" - DESCRIPTION = "Description" - H1 = "H1" - H2 = "H2" - H3 = "H3" - H4 = "H4" - H5 = "H5" - H6 = "H6" - DIV = "Div" - -class URLHTMLContentInfo(BaseModel): - url_id: Optional[int] = None +class URLHTMLContentInfo(BulkInsertableModel): + url_id: int | None = None content_type: HTMLContentType - content: str | list[str] \ No newline at end of file + content: str | list[str] + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URLHTMLContent \ No newline at end of file diff --git a/src/db/helpers/session/parser.py b/src/db/helpers/session/parser.py index bc822022..b580dcd1 100644 --- a/src/db/helpers/session/parser.py +++ b/src/db/helpers/session/parser.py @@ -1,5 +1,5 @@ from src.db.helpers.session.types import BulkActionType -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.templates.protocols.sa_correlated.core import SQLAlchemyCorrelatedProtocol from src.db.templates.protocols.sa_correlated.with_id import SQLAlchemyCorrelatedWithIDProtocol from src.db.utils.validate import validate_all_models_of_same_type diff --git a/src/db/helpers/session/session_helper.py 
b/src/db/helpers/session/session_helper.py index 9736cd9e..a616664f 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -11,7 +11,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.helpers.session.parser import BulkActionParser -from src.db.models.templates import Base, StandardBase +from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.update import BulkUpdatableModel @@ -92,7 +93,7 @@ async def add( async def add_all( session: AsyncSession, - models: list[StandardBase], + models: list[WithIDBase], return_ids: bool = False ) -> list[int] | None: session.add_all(models) diff --git a/src/db/models/instantiations/agency/pydantic/upsert.py b/src/db/models/instantiations/agency/pydantic/upsert.py index 9a869e84..1deeb6b5 100644 --- a/src/db/models/instantiations/agency/pydantic/upsert.py +++ b/src/db/models/instantiations/agency/pydantic/upsert.py @@ -1,7 +1,7 @@ from datetime import datetime from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.upsert import BulkUpsertableModel diff --git a/src/db/models/instantiations/agency/sqlalchemy.py b/src/db/models/instantiations/agency/sqlalchemy.py index 2ce3676f..8310eeac 100644 --- a/src/db/models/instantiations/agency/sqlalchemy.py +++ b/src/db/models/instantiations/agency/sqlalchemy.py @@ -6,13 +6,13 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import Base, StandardBase +from src.db.models.templates_.with_id import WithIDBase class Agency( CreatedAtMixin, # When agency was added to database UpdatedAtMixin, # When 
agency was last updated in database - StandardBase + WithIDBase ): __tablename__ = "agencies" diff --git a/src/db/models/instantiations/backlog_snapshot.py b/src/db/models/instantiations/backlog_snapshot.py index 89645160..6b0982cd 100644 --- a/src/db/models/instantiations/backlog_snapshot.py +++ b/src/db/models/instantiations/backlog_snapshot.py @@ -1,10 +1,10 @@ from sqlalchemy import Column, Integer from src.db.models.mixins import CreatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class BacklogSnapshot(CreatedAtMixin, StandardBase): +class BacklogSnapshot(CreatedAtMixin, WithIDBase): __tablename__ = "backlog_snapshot" count_pending_total = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/batch/sqlalchemy.py b/src/db/models/instantiations/batch/sqlalchemy.py index b001dbac..0e6aa611 100644 --- a/src/db/models/instantiations/batch/sqlalchemy.py +++ b/src/db/models/instantiations/batch/sqlalchemy.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.helpers import CURRENT_TIME_SERVER_DEFAULT -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import batch_status_enum -class Batch(StandardBase): +class Batch(WithIDBase): __tablename__ = 'batches' strategy = Column( diff --git a/src/db/models/instantiations/change_log.py b/src/db/models/instantiations/change_log.py index 975958ab..0cb74659 100644 --- a/src/db/models/instantiations/change_log.py +++ b/src/db/models/instantiations/change_log.py @@ -5,10 +5,10 @@ from src.db.enums import ChangeLogOperationType from src.db.models.mixins import CreatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class ChangeLog(CreatedAtMixin, StandardBase): +class ChangeLog(CreatedAtMixin, WithIDBase): __tablename__ = "change_log" diff --git 
a/src/db/models/instantiations/duplicate/sqlalchemy.py b/src/db/models/instantiations/duplicate/sqlalchemy.py index 67df3af5..03c492e3 100644 --- a/src/db/models/instantiations/duplicate/sqlalchemy.py +++ b/src/db/models/instantiations/duplicate/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class Duplicate(BatchDependentMixin, StandardBase): +class Duplicate(BatchDependentMixin, WithIDBase): """ Identifies duplicates which occur within a batch """ diff --git a/src/db/models/instantiations/link/batch_url.py b/src/db/models/instantiations/link/batch_url.py index d86b0703..8fb8f42e 100644 --- a/src/db/models/instantiations/link/batch_url.py +++ b/src/db/models/instantiations/link/batch_url.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin, BatchDependentMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase class LinkBatchURL( @@ -9,7 +9,7 @@ class LinkBatchURL( CreatedAtMixin, URLDependentMixin, BatchDependentMixin, - StandardBase + WithIDBase ): __tablename__ = "link_batch_urls" diff --git a/src/db/models/instantiations/link/task_url.py b/src/db/models/instantiations/link/task_url.py index 02ef02c3..2535d317 100644 --- a/src/db/models/instantiations/link/task_url.py +++ b/src/db/models/instantiations/link/task_url.py @@ -1,6 +1,6 @@ from sqlalchemy import UniqueConstraint, Column, Integer, ForeignKey -from src.db.models.templates import Base +from src.db.models.templates_.base import Base class LinkTaskURL(Base): diff --git a/src/db/models/instantiations/link/url_agency/sqlalchemy.py b/src/db/models/instantiations/link/url_agency/sqlalchemy.py index 28e42924..f8d72065 100644 --- 
a/src/db/models/instantiations/link/url_agency/sqlalchemy.py +++ b/src/db/models/instantiations/link/url_agency/sqlalchemy.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class LinkURLAgency(URLDependentMixin, StandardBase): +class LinkURLAgency(URLDependentMixin, WithIDBase): __tablename__ = "link_urls_agencies" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/instantiations/log/sqlalchemy.py b/src/db/models/instantiations/log/sqlalchemy.py index 769391cf..60f17875 100644 --- a/src/db/models/instantiations/log/sqlalchemy.py +++ b/src/db/models/instantiations/log/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, BatchDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class Log(CreatedAtMixin, BatchDependentMixin, StandardBase): +class Log(CreatedAtMixin, BatchDependentMixin, WithIDBase): __tablename__ = 'logs' log = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/missing.py b/src/db/models/instantiations/missing.py index 05665eba..6ad868df 100644 --- a/src/db/models/instantiations/missing.py +++ b/src/db/models/instantiations/missing.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class Missing(BatchDependentMixin, StandardBase): +class Missing(BatchDependentMixin, WithIDBase): __tablename__ = 'missing' place_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/root_url_cache.py b/src/db/models/instantiations/root_url_cache.py index 4ebadd50..f79e4b5c 100644 --- 
a/src/db/models/instantiations/root_url_cache.py +++ b/src/db/models/instantiations/root_url_cache.py @@ -1,10 +1,10 @@ from sqlalchemy import UniqueConstraint, Column, String from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class RootURL(UpdatedAtMixin, StandardBase): +class RootURL(UpdatedAtMixin, WithIDBase): __tablename__ = 'root_url_cache' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/state/huggingface.py b/src/db/models/instantiations/state/huggingface.py index 58e54cdc..d858dc0a 100644 --- a/src/db/models/instantiations/state/huggingface.py +++ b/src/db/models/instantiations/state/huggingface.py @@ -1,6 +1,6 @@ from sqlalchemy import Column, Integer, DateTime -from src.db.models.templates import Base +from src.db.models.templates_.base import Base class HuggingFaceUploadState(Base): diff --git a/src/db/models/instantiations/state/sync/agencies.py b/src/db/models/instantiations/state/sync/agencies.py index 207a2936..7ee1babe 100644 --- a/src/db/models/instantiations/state/sync/agencies.py +++ b/src/db/models/instantiations/state/sync/agencies.py @@ -4,7 +4,7 @@ from sqlalchemy import DateTime, Date, Integer, Column -from src.db.models.templates import Base +from src.db.models.templates_.base import Base class AgenciesSyncState(Base): diff --git a/src/db/models/instantiations/state/sync/data_sources.py b/src/db/models/instantiations/state/sync/data_sources.py index cf173860..333d0945 100644 --- a/src/db/models/instantiations/state/sync/data_sources.py +++ b/src/db/models/instantiations/state/sync/data_sources.py @@ -1,6 +1,6 @@ from sqlalchemy import Integer, Column, DateTime, Date -from src.db.models.templates import Base +from src.db.models.templates_.base import Base class DataSourcesSyncState(Base): diff --git a/src/db/models/instantiations/task/core.py b/src/db/models/instantiations/task/core.py index 
514301c8..291a5d0a 100644 --- a/src/db/models/instantiations/task/core.py +++ b/src/db/models/instantiations/task/core.py @@ -3,11 +3,11 @@ from src.db.enums import PGEnum, TaskType from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import batch_status_enum -class Task(UpdatedAtMixin, StandardBase): +class Task(UpdatedAtMixin, WithIDBase): __tablename__ = 'tasks' task_type = Column( diff --git a/src/db/models/instantiations/task/error.py b/src/db/models/instantiations/task/error.py index 03014904..c5a25e78 100644 --- a/src/db/models/instantiations/task/error.py +++ b/src/db/models/instantiations/task/error.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, StandardBase): +class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): __tablename__ = 'task_errors' error = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/url/checked_for_duplicate.py b/src/db/models/instantiations/url/checked_for_duplicate.py index 9443d0ac..bb7cf666 100644 --- a/src/db/models/instantiations/url/checked_for_duplicate.py +++ b/src/db/models/instantiations/url/checked_for_duplicate.py @@ -1,10 +1,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, StandardBase): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = 'url_checked_for_duplicate' # Relationships diff --git a/src/db/models/instantiations/url/core/pydantic/insert.py 
b/src/db/models/instantiations/url/core/pydantic/insert.py index 230c93c0..e384416e 100644 --- a/src/db/models/instantiations/url/core/pydantic/insert.py +++ b/src/db/models/instantiations/url/core/pydantic/insert.py @@ -1,7 +1,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/core/sqlalchemy.py b/src/db/models/instantiations/url/core/sqlalchemy.py index 8a476071..4b4c0159 100644 --- a/src/db/models/instantiations/url/core/sqlalchemy.py +++ b/src/db/models/instantiations/url/core/sqlalchemy.py @@ -1,16 +1,14 @@ -from sqlalchemy import Column, Integer, ForeignKey, Text, String, JSON, Enum -from sqlalchemy.dialects import postgresql +from sqlalchemy import Column, Text, String, JSON from sqlalchemy.orm import relationship from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.helpers import enum_column from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardBase -from src.db.models.types import record_type_values +from src.db.models.templates_.with_id import WithIDBase -class URL(UpdatedAtMixin, CreatedAtMixin, StandardBase): +class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): __tablename__ = 'urls' # The batch this URL is associated with @@ -84,4 +82,12 @@ class URL(UpdatedAtMixin, CreatedAtMixin, StandardBase): "URLCompressedHTML", uselist=False, back_populates="url" + ) + scrape_info = relationship( + "URLScrapeInfo", + uselist=False, + ) + web_metadata = relationship( + "URLWebMetadata", + uselist=False, ) \ No newline at end of file diff --git a/src/db/models/instantiations/url/data_source/sqlalchemy.py 
b/src/db/models/instantiations/url/data_source/sqlalchemy.py index b5bdb40d..270ba7e3 100644 --- a/src/db/models/instantiations/url/data_source/sqlalchemy.py +++ b/src/db/models/instantiations/url/data_source/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLDataSource(CreatedAtMixin, URLDependentMixin, StandardBase): +class URLDataSource(CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = "url_data_sources" data_source_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/error_info/pydantic.py b/src/db/models/instantiations/url/error_info/pydantic.py index 46f5b9fa..c8596a13 100644 --- a/src/db/models/instantiations/url/error_info/pydantic.py +++ b/src/db/models/instantiations/url/error_info/pydantic.py @@ -3,9 +3,17 @@ from pydantic import BaseModel +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel -class URLErrorPydanticInfo(BaseModel): + +class URLErrorPydanticInfo(BulkInsertableModel): task_id: int url_id: int error: str - updated_at: Optional[datetime.datetime] = None \ No newline at end of file + updated_at: datetime.datetime = None + + @classmethod + def sa_model(cls) -> type[Base]: + return URLErrorInfo \ No newline at end of file diff --git a/src/db/models/instantiations/url/error_info/sqlalchemy.py b/src/db/models/instantiations/url/error_info/sqlalchemy.py index 8825777f..59f6c263 100644 --- a/src/db/models/instantiations/url/error_info/sqlalchemy.py +++ b/src/db/models/instantiations/url/error_info/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin, URLDependentMixin -from 
src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, StandardBase): +class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, WithIDBase): __tablename__ = 'url_error_info' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/html/__init__.py b/src/db/models/instantiations/url/html/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html/compressed/__init__.py b/src/db/models/instantiations/url/html/compressed/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html/compressed/pydantic.py b/src/db/models/instantiations/url/html/compressed/pydantic.py new file mode 100644 index 00000000..b626b5c2 --- /dev/null +++ b/src/db/models/instantiations/url/html/compressed/pydantic.py @@ -0,0 +1,13 @@ +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLCompressedHTMLPydantic(BulkInsertableModel): + url_id: int + compressed_html: bytes + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URLCompressedHTML \ No newline at end of file diff --git a/src/db/models/instantiations/url/compressed_html.py b/src/db/models/instantiations/url/html/compressed/sqlalchemy.py similarity index 86% rename from src/db/models/instantiations/url/compressed_html.py rename to src/db/models/instantiations/url/html/compressed/sqlalchemy.py index 92e340a5..995c5b25 100644 --- a/src/db/models/instantiations/url/compressed_html.py +++ b/src/db/models/instantiations/url/html/compressed/sqlalchemy.py @@ -2,13 +2,13 @@ from sqlalchemy.orm import relationship, Mapped from src.db.models.mixins import CreatedAtMixin, 
URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - StandardBase + WithIDBase ): __tablename__ = 'url_compressed_html' diff --git a/src/db/models/instantiations/url/html/content/__init__.py b/src/db/models/instantiations/url/html/content/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html/content/enums.py b/src/db/models/instantiations/url/html/content/enums.py new file mode 100644 index 00000000..13820352 --- /dev/null +++ b/src/db/models/instantiations/url/html/content/enums.py @@ -0,0 +1,13 @@ +from enum import Enum + + +class HTMLContentType(Enum): + TITLE = "Title" + DESCRIPTION = "Description" + H1 = "H1" + H2 = "H2" + H3 = "H3" + H4 = "H4" + H5 = "H5" + H6 = "H6" + DIV = "Div" diff --git a/src/db/models/instantiations/url/html/content/pydantic.py b/src/db/models/instantiations/url/html/content/pydantic.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html_content.py b/src/db/models/instantiations/url/html/content/sqlalchemy.py similarity index 82% rename from src/db/models/instantiations/url/html_content.py rename to src/db/models/instantiations/url/html/content/sqlalchemy.py index b23af35c..63e4da76 100644 --- a/src/db/models/instantiations/url/html_content.py +++ b/src/db/models/instantiations/url/html/content/sqlalchemy.py @@ -3,10 +3,14 @@ from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLHTMLContent(UpdatedAtMixin, URLDependentMixin, StandardBase): +class URLHTMLContent( + UpdatedAtMixin, + URLDependentMixin, + WithIDBase +): __tablename__ = 'url_html_content' __table_args__ = (UniqueConstraint( "url_id", diff --git 
a/src/db/models/instantiations/url/optional_data_source_metadata.py b/src/db/models/instantiations/url/optional_data_source_metadata.py index fac99828..bb2a95e5 100644 --- a/src/db/models/instantiations/url/optional_data_source_metadata.py +++ b/src/db/models/instantiations/url/optional_data_source_metadata.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLOptionalDataSourceMetadata(URLDependentMixin, StandardBase): +class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): __tablename__ = 'url_optional_data_source_metadata' record_formats = Column(ARRAY(String), nullable=True) diff --git a/src/db/models/instantiations/url/probed_for_404.py b/src/db/models/instantiations/url/probed_for_404.py index b795b628..478ce9de 100644 --- a/src/db/models/instantiations/url/probed_for_404.py +++ b/src/db/models/instantiations/url/probed_for_404.py @@ -2,10 +2,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLProbedFor404(URLDependentMixin, StandardBase): +class URLProbedFor404(URLDependentMixin, WithIDBase): __tablename__ = 'url_probed_for_404' last_probed_at = get_created_at_column() diff --git a/src/db/models/instantiations/url/reviewing_user.py b/src/db/models/instantiations/url/reviewing_user.py index 938f86ab..9213a157 100644 --- a/src/db/models/instantiations/url/reviewing_user.py +++ b/src/db/models/instantiations/url/reviewing_user.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class ReviewingUserURL(CreatedAtMixin, 
URLDependentMixin, StandardBase): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = 'reviewing_user_url' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/url/scrape_info/__init__.py b/src/db/models/instantiations/url/scrape_info/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/scrape_info/enums.py b/src/db/models/instantiations/url/scrape_info/enums.py new file mode 100644 index 00000000..3e16fff3 --- /dev/null +++ b/src/db/models/instantiations/url/scrape_info/enums.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class ScrapeStatus(Enum): + SUCCESS = "success" + ERROR = "error" \ No newline at end of file diff --git a/src/db/models/instantiations/url/scrape_info/pydantic.py b/src/db/models/instantiations/url/scrape_info/pydantic.py new file mode 100644 index 00000000..f41b1642 --- /dev/null +++ b/src/db/models/instantiations/url/scrape_info/pydantic.py @@ -0,0 +1,13 @@ +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLScrapeInfoInsertModel(BulkInsertableModel): + url_id: int + status: ScrapeStatus + + @classmethod + def sa_model(cls) -> type[Base]: + return URLScrapeInfo \ No newline at end of file diff --git a/src/db/models/instantiations/url/scrape_info/sqlalchemy.py b/src/db/models/instantiations/url/scrape_info/sqlalchemy.py new file mode 100644 index 00000000..d97e0b93 --- /dev/null +++ b/src/db/models/instantiations/url/scrape_info/sqlalchemy.py @@ -0,0 +1,17 @@ +from src.db.models.helpers import enum_column +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.standard import StandardBase + + 
+class URLScrapeInfo( + StandardBase, + URLDependentMixin +): + + __tablename__ = 'url_scrape_info' + + status = enum_column( + enum_type=ScrapeStatus, + name='scrape_status', + ) \ No newline at end of file diff --git a/src/db/models/instantiations/url/suggestion/agency/auto.py b/src/db/models/instantiations/url/suggestion/agency/auto.py index 01585535..5ecfdf0a 100644 --- a/src/db/models/instantiations/url/suggestion/agency/auto.py +++ b/src/db/models/instantiations/url/suggestion/agency/auto.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): +class AutomatedUrlAgencySuggestion(URLDependentMixin, WithIDBase): __tablename__ = "automated_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/agency/user.py b/src/db/models/instantiations/url/suggestion/agency/user.py index 5a54399f..7a338fd0 100644 --- a/src/db/models/instantiations/url/suggestion/agency/user.py +++ b/src/db/models/instantiations/url/suggestion/agency/user.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class UserUrlAgencySuggestion(URLDependentMixin, StandardBase): +class UserUrlAgencySuggestion(URLDependentMixin, WithIDBase): __tablename__ = "user_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/record_type/auto.py b/src/db/models/instantiations/url/suggestion/record_type/auto.py index 34faf6f3..2aaed526 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/auto.py +++ 
b/src/db/models/instantiations/url/suggestion/record_type/auto.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values @@ -11,7 +11,7 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardBase + WithIDBase ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/record_type/user.py b/src/db/models/instantiations/url/suggestion/record_type/user.py index 77954509..8fcc816b 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/user.py +++ b/src/db/models/instantiations/url/suggestion/record_type/user.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): +class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = "user_record_type_suggestions" user_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py index 982b4449..49dc7457 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py +++ b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, 
CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): +class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = "auto_relevant_suggestions" relevant = Column(Boolean, nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/relevant/user.py b/src/db/models/instantiations/url/suggestion/relevant/user.py index b087f71e..a0cfed44 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/user.py +++ b/src/db/models/instantiations/url/suggestion/relevant/user.py @@ -3,14 +3,14 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase class UserRelevantSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardBase + WithIDBase ): __tablename__ = "user_relevant_suggestions" diff --git a/src/db/models/instantiations/url/web_metadata/pydantic.py b/src/db/models/instantiations/url/web_metadata/pydantic.py index 31a05d4a..c0460437 100644 --- a/src/db/models/instantiations/url/web_metadata/pydantic.py +++ b/src/db/models/instantiations/url/web_metadata/pydantic.py @@ -1,5 +1,5 @@ from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py index 903bdc43..45f5233c 100644 --- a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py +++ b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py @@ -1,11 +1,11 @@ from 
sqlalchemy import Column, Text, Boolean, Integer from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase class URLWebMetadata( - StandardBase, + WithIDBase, URLDependentMixin, CreatedAtMixin, UpdatedAtMixin diff --git a/src/db/models/templates.py b/src/db/models/templates.py deleted file mode 100644 index 5e738fab..00000000 --- a/src/db/models/templates.py +++ /dev/null @@ -1,11 +0,0 @@ -from sqlalchemy import Integer, Column -from sqlalchemy.orm import declarative_base - -# Base class for SQLAlchemy ORM models -Base = declarative_base() - -class StandardBase(Base): - __abstract__ = True - - id = Column(Integer, primary_key=True, autoincrement=True) - diff --git a/src/db/models/templates_/__init__.py b/src/db/models/templates_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/templates_/base.py b/src/db/models/templates_/base.py new file mode 100644 index 00000000..0ec5f68e --- /dev/null +++ b/src/db/models/templates_/base.py @@ -0,0 +1,4 @@ +"""Base class for SQLAlchemy ORM models.""" +from sqlalchemy.orm import declarative_base + +Base = declarative_base() diff --git a/src/db/models/templates_/standard.py b/src/db/models/templates_/standard.py new file mode 100644 index 00000000..85a01941 --- /dev/null +++ b/src/db/models/templates_/standard.py @@ -0,0 +1,14 @@ +from sqlalchemy import Column, Integer + +from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base + + +class StandardBase( + Base, + CreatedAtMixin, + UpdatedAtMixin, +): + __abstract__ = True + + id = Column(Integer, primary_key=True, autoincrement=True) diff --git a/src/db/models/templates_/with_id.py b/src/db/models/templates_/with_id.py new file mode 100644 index 00000000..e454f215 --- /dev/null +++ b/src/db/models/templates_/with_id.py @@ -0,0 +1,11 @@ +from sqlalchemy import 
Integer, Column + +from src.db.models.templates_.base import Base + + + +class WithIDBase(Base): + __abstract__ = True + + id = Column(Integer, primary_key=True, autoincrement=True) + diff --git a/src/db/queries/implementations/core/get/html_content_info.py b/src/db/queries/implementations/core/get/html_content_info.py index fb26a527..d647acc1 100644 --- a/src/db/queries/implementations/core/get/html_content_info.py +++ b/src/db/queries/implementations/core/get/html_content_info.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 5af4ba5c..2e9a69e8 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -1,3 +1,4 @@ +from http import HTTPStatus from typing import Any from sqlalchemy import Select, select, exists, func, Subquery, and_, not_, ColumnElement @@ -11,10 +12,11 @@ from src.db.models.instantiations.link.task_url import LinkTaskURL from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from 
src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType @@ -38,19 +40,13 @@ def has_non_errored_urls_without_html_data() -> Select: query = ( select(URL) .join(URLWebMetadata) - .outerjoin(URLHTMLContent) - .where(URLHTMLContent.id == None) - .where(~exists(exclude_subquery)) - .where(URLWebMetadata.content_type.like("%html%")) - .where(URL.outcome.in_( - [ - URLStatus.PENDING, - URLStatus.NOT_RELEVANT, - URLStatus.INDIVIDUAL_RECORD, - URLStatus.SUBMITTED, - URLStatus.VALIDATED - ] - )) + .outerjoin(URLScrapeInfo) + .where( + URLScrapeInfo.id == None, + ~exists(exclude_subquery), + URLWebMetadata.status_code == HTTPStatus.OK.value, + URLWebMetadata.content_type.like("%html%"), + ) .options( selectinload(URL.batch) ) diff --git a/src/db/templates/protocols/sa_correlated/core.py b/src/db/templates/protocols/sa_correlated/core.py index 6b77c835..82475e60 100644 --- a/src/db/templates/protocols/sa_correlated/core.py +++ b/src/db/templates/protocols/sa_correlated/core.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import Protocol, runtime_checkable -from src.db.models.templates import Base +from src.db.models.templates_.base import Base @runtime_checkable diff --git a/src/db/templates/protocols/sa_correlated/with_id.py b/src/db/templates/protocols/sa_correlated/with_id.py index 4e3609e1..7e920e76 100644 --- a/src/db/templates/protocols/sa_correlated/with_id.py +++ b/src/db/templates/protocols/sa_correlated/with_id.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import Protocol, runtime_checkable -from src.db.models.templates import Base +from src.db.models.templates_.base import Base @runtime_checkable diff --git a/src/external/url_request/dtos/url_response.py b/src/external/url_request/dtos/url_response.py index 8e17c078..57303a7c 100644 --- a/src/external/url_request/dtos/url_response.py +++ b/src/external/url_request/dtos/url_response.py @@ -6,7 +6,7 @@ class 
URLResponseInfo(BaseModel): success: bool - status: Optional[HTTPStatus] = None - html: Optional[str] = None - content_type: Optional[str] = None - exception: Optional[str] = None + status: HTTPStatus | None = None + html: str | None = None + content_type: str | None = None + exception: str | None = None diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 3eb18773..13327bfd 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -61,7 +61,8 @@ def id_column() -> sa.Column: sa.Integer(), primary_key=True, autoincrement=True, - nullable=False + nullable=False, + comment='The primary identifier for the row.' ) def created_at_column() -> sa.Column: @@ -70,7 +71,8 @@ def created_at_column() -> sa.Column: 'created_at', sa.DateTime(), server_default=sa.text('now()'), - nullable=False + nullable=False, + comment='The time the row was created.' ) def updated_at_column() -> sa.Column: @@ -80,7 +82,8 @@ def updated_at_column() -> sa.Column: sa.DateTime(), server_default=sa.text('now()'), server_onupdate=sa.text('now()'), - nullable=False + nullable=False, + comment='The last time the row was updated.' ) def url_id_column() -> sa.Column: @@ -91,7 +94,8 @@ def url_id_column() -> sa.Column: 'urls.id', ondelete='CASCADE' ), - nullable=False + nullable=False, + comment='A foreign key to the `urls` table.' ) def batch_id_column(nullable=False) -> sa.Column: @@ -102,5 +106,6 @@ def batch_id_column(nullable=False) -> sa.Column: 'batches.id', ondelete='CASCADE' ), - nullable=nullable + nullable=nullable, + comment='A foreign key to the `batches` table.' 
) \ No newline at end of file diff --git a/tests/automated/integration/db/structure/testers/table.py b/tests/automated/integration/db/structure/testers/table.py index aed5d3a5..a91c0837 100644 --- a/tests/automated/integration/db/structure/testers/table.py +++ b/tests/automated/integration/db/structure/testers/table.py @@ -7,7 +7,7 @@ from sqlalchemy.exc import DataError from src.db.helpers.connect import get_postgres_connection_string -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from tests.automated.integration.db.structure.testers.models.column import ColumnTester from tests.automated.integration.db.structure.types import ConstraintTester, SATypes diff --git a/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py index dc0a3452..8e345d51 100644 --- a/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import compress_html diff --git a/tests/automated/integration/tasks/url/html/asserts.py b/tests/automated/integration/tasks/url/html/asserts.py deleted file mode 100644 index 9ca241cd..00000000 --- a/tests/automated/integration/tasks/url/html/asserts.py +++ /dev/null @@ -1,52 +0,0 @@ -from src.api.endpoints.task.by_id.dto import TaskInfo -from src.collectors.enums import URLStatus -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType 
-from tests.automated.integration.tasks.url.html.mocks.constants import MOCK_HTML_CONTENT - - -async def assert_success_url_has_two_html_content_entries( - adb: AsyncDatabaseClient, - run_info, - url_id: int -): - await adb.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids) - hci = await adb.get_html_content_info(url_id=url_id) - assert len(hci) == 2 - -async def assert_url_has_one_compressed_html_content_entry( - adb: AsyncDatabaseClient, - url_id: int -): - html = await adb.get_html_for_url(url_id=url_id) - assert html == MOCK_HTML_CONTENT - -async def assert_success_url_has_one_compressed_html_content_entry( - adb: AsyncDatabaseClient, - run_info, - url_id: int -): - await adb.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids) - hci = await adb.get_html_content_info(url_id=url_id) - assert len(hci) == 1 - -async def assert_404_url_has_404_status( - adb: AsyncDatabaseClient, - url_id: int -): - url_info_404 = await adb.get_url_info_by_id(url_id=url_id) - assert url_info_404.outcome == URLStatus.NOT_FOUND - - -def assert_task_has_one_url_error(task_info): - assert len(task_info.url_errors) == 1 - assert task_info.url_errors[0].error == "test error" - - -def assert_task_type_is_html(task_info): - assert task_info.task_type == TaskType.HTML - - -def assert_html_task_ran_without_error(task_info: TaskInfo): - assert task_info.error_info is None diff --git a/tests/automated/integration/tasks/url/html/check/__init__.py b/tests/automated/integration/tasks/url/html/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/check/manager.py b/tests/automated/integration/tasks/url/html/check/manager.py new file mode 100644 index 00000000..accb7409 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/check/manager.py @@ -0,0 +1,66 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.sqlalchemy import URL 
+from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.automated.integration.tasks.url.html.setup.models.record import TestURLHTMLTaskSetupRecord + + +class TestURLHTMLTaskCheckManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient, + records: list[TestURLHTMLTaskSetupRecord] + ): + self.adb_client = adb_client + self.records = records + self._id_to_entry = {record.url_id: record.entry for record in records} + + async def check(self): + await self._check_has_html() + await self._check_scrape_status() + await self._check_has_same_url_status() + await self._check_marked_as_404() + + async def _check_has_html(self) -> None: + urls_with_html = [ + record.url_id + for record in self.records + if record.entry.expected_result.has_html + ] + + compressed_html_list: list[URLCompressedHTML] = await self.adb_client.get_all(URLCompressedHTML) + assert len(compressed_html_list) == len(urls_with_html) + for compressed_html in compressed_html_list: + assert compressed_html.url_id in urls_with_html + + async def _check_scrape_status(self) -> None: + urls_with_scrape_status = [ + record.url_id + for record in self.records + if record.entry.expected_result.scrape_status is not None + ] + + url_scrape_info_list: list[URLScrapeInfo] = await self.adb_client.get_all(URLScrapeInfo) + assert len(url_scrape_info_list) == len(urls_with_scrape_status) + for url_scrape_info in url_scrape_info_list: + assert url_scrape_info.url_id in urls_with_scrape_status + entry = self._id_to_entry[url_scrape_info.url_id] + expected_scrape_status = entry.expected_result.scrape_status + assert url_scrape_info.status == expected_scrape_status + + async def _check_has_same_url_status(self): + urls: list[URL] = await self.adb_client.get_all(URL) + for url in urls: + entry = 
self._id_to_entry[url.id] + if entry.expected_result.web_metadata_status_marked_404: + continue + assert url.outcome == entry.url_info.status, f"URL {url.url} has outcome {url.outcome} instead of {entry.url_info.status}" + + async def _check_marked_as_404(self): + web_metadata_list: list[URLWebMetadata] = await self.adb_client.get_all(URLWebMetadata) + for web_metadata in web_metadata_list: + entry = self._id_to_entry[web_metadata.url_id] + if entry.expected_result.web_metadata_status_marked_404: + assert web_metadata.status_code == 404, f"URL {entry.url_info.url} has status code {web_metadata.status_code} instead of 404" diff --git a/tests/automated/integration/tasks/url/html/mocks/constants.py b/tests/automated/integration/tasks/url/html/mocks/constants.py deleted file mode 100644 index 0b60341d..00000000 --- a/tests/automated/integration/tasks/url/html/mocks/constants.py +++ /dev/null @@ -1,3 +0,0 @@ - -MOCK_HTML_CONTENT = "" -MOCK_CONTENT_TYPE = "text/html" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/mocks/methods.py b/tests/automated/integration/tasks/url/html/mocks/methods.py index ddf1fc6f..d6799eea 100644 --- a/tests/automated/integration/tasks/url/html/mocks/methods.py +++ b/tests/automated/integration/tasks/url/html/mocks/methods.py @@ -1,55 +1,9 @@ -from http import HTTPStatus from typing import Optional -from aiohttp import ClientResponseError, RequestInfo - from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.external.url_request.dtos.url_response import URLResponseInfo -from tests.automated.integration.tasks.url.html.mocks.constants import MOCK_CONTENT_TYPE, MOCK_HTML_CONTENT - - -async def mock_make_requests(self, urls: list[str]) -> list[URLResponseInfo]: - results = [] - for idx, url in enumerate(urls): - # Second result should produce a 404 - if idx == 1: - results.append( - URLResponseInfo( - success=False, - content_type=MOCK_CONTENT_TYPE, - 
exception=str(ClientResponseError( - request_info=RequestInfo( - url=url, - method="GET", - real_url=url, - headers={}, - ), - code=HTTPStatus.NOT_FOUND.value, - history=(None,), - )), - status=HTTPStatus.NOT_FOUND - ) - ) - continue - - if idx == 2: - # 3rd result should produce an error - results.append( - URLResponseInfo( - success=False, - exception=str(ValueError("test error")), - content_type=MOCK_CONTENT_TYPE - )) - else: - # All other results should succeed - results.append(URLResponseInfo( - html=MOCK_HTML_CONTENT, success=True, content_type=MOCK_CONTENT_TYPE)) - return results async def mock_parse(self, url: str, html_content: str, content_type: str) -> ResponseHTMLInfo: - assert html_content == MOCK_HTML_CONTENT - assert content_type == MOCK_CONTENT_TYPE return ResponseHTMLInfo( url=url, title="fake title", diff --git a/tests/automated/integration/tasks/url/html/mocks/url_request_interface/__init__.py b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py new file mode 100644 index 00000000..a8dde5b5 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py @@ -0,0 +1,11 @@ +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.html.mocks.url_request_interface.setup import setup_url_to_response_info + + +class MockURLRequestInterface: + + def __init__(self): + self._url_to_response_info: dict[str, URLResponseInfo] = setup_url_to_response_info() + + async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: + return [self._url_to_response_info[url] for url in urls] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py 
b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py new file mode 100644 index 00000000..cff46013 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py @@ -0,0 +1,45 @@ +from http import HTTPStatus + +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.html.setup.data import TEST_ENTRIES +from tests.automated.integration.tasks.url.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestErrorType +from tests.helpers.simple_test_data_functions import generate_test_html + + +def _get_success( + entry: TestURLHTMLTaskSetupEntry +) -> bool: + if entry.give_error is not None: + return False + return True + +def get_http_status( + entry: TestURLHTMLTaskSetupEntry +) -> HTTPStatus: + if entry.give_error is None: + return HTTPStatus.OK + if entry.give_error == TestErrorType.HTTP_404: + return HTTPStatus.NOT_FOUND + return HTTPStatus.INTERNAL_SERVER_ERROR + +def _get_content_type( + entry: TestURLHTMLTaskSetupEntry +) -> str | None: + if entry.give_error is not None: + return None + return "text/html" + + +def setup_url_to_response_info( +) -> dict[str, URLResponseInfo]: + d = {} + for entry in TEST_ENTRIES: + response_info = URLResponseInfo( + success=_get_success(entry), + status=get_http_status(entry), + html=generate_test_html() if _get_success(entry) else None, + content_type=_get_content_type(entry), + exception=None if _get_success(entry) else "Error" + ) + d[entry.url_info.url] = response_info + return d diff --git a/tests/automated/integration/tasks/url/html/setup.py b/tests/automated/integration/tasks/url/html/setup.py deleted file mode 100644 index 2d6a47a7..00000000 --- a/tests/automated/integration/tasks/url/html/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import types - -from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.html.scraper.parser.core import 
HTMLResponseParser - -from src.external.url_request.core import URLRequestInterface -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache -from src.db.client.async_ import AsyncDatabaseClient -from tests.automated.integration.tasks.url.html.mocks.methods import mock_make_requests, mock_get_from_cache, mock_parse - - -async def setup_mocked_url_request_interface() -> URLRequestInterface: - url_request_interface = URLRequestInterface() - url_request_interface.make_requests_with_html = types.MethodType(mock_make_requests, url_request_interface) - return url_request_interface - - -async def setup_mocked_root_url_cache() -> RootURLCache: - mock_root_url_cache = RootURLCache() - mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache) - return mock_root_url_cache - - -async def setup_urls(db_data_creator) -> list[int]: - batch_id = db_data_creator.batch() - url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings - url_ids = [url_info.url_id for url_info in url_mappings] - return url_ids - - -async def setup_operator() -> URLHTMLTaskOperator: - html_parser = HTMLResponseParser( - root_url_cache=await setup_mocked_root_url_cache() - ) - html_parser.parse = types.MethodType(mock_parse, html_parser) - operator = URLHTMLTaskOperator( - adb_client=AsyncDatabaseClient(), - url_request_interface=await setup_mocked_url_request_interface(), - html_parser=html_parser - ) - return operator diff --git a/tests/automated/integration/tasks/url/html/setup/__init__.py b/tests/automated/integration/tasks/url/html/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/setup/data.py b/tests/automated/integration/tasks/url/html/setup/data.py new file mode 100644 index 00000000..9c488484 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/data.py @@ -0,0 +1,94 @@ +from http import HTTPStatus + +from 
src.collectors.enums import URLStatus +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from tests.automated.integration.tasks.url.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestURLInfo, \ + TestWebMetadataInfo, ExpectedResult, TestErrorType + +TEST_ENTRIES = [ + # URLs that give 200s should be updated with the appropriate scrape status + # and their html should be stored + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://happy-path.com/pending", + status=URLStatus.PENDING + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.OK, + error_message=None + ), + expected_result=ExpectedResult( + has_html=True, # Test for both compressed HTML and content metadata + scrape_status=ScrapeStatus.SUCCESS + ) + ), + # URLs that give 404s should be updated with the appropriate scrape status + # and their web metadata status should be updated to 404 + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://not-found-path.com/submitted", + status=URLStatus.ERROR + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.OK, + error_message=None + ), + give_error=TestErrorType.HTTP_404, + expected_result=ExpectedResult( + has_html=False, + scrape_status=ScrapeStatus.ERROR, + web_metadata_status_marked_404=True + ) + ), + # URLs that give errors should be updated with the appropriate scrape status + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://error-path.com/submitted", + status=URLStatus.ERROR + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.OK, + error_message=None + ), + give_error=TestErrorType.SCRAPER, + expected_result=ExpectedResult( + has_html=False, + scrape_status=ScrapeStatus.ERROR + ) + ), + # URLs with non-200 web metadata should not be processed + TestURLHTMLTaskSetupEntry( + 
url_info=TestURLInfo( + url="https://not-200-path.com/submitted", + status=URLStatus.PENDING + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.PERMANENT_REDIRECT, + error_message=None + ), + expected_result=ExpectedResult( + has_html=False, + scrape_status=None + ) + ), + # URLs with no web metadata should not be processed + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://no-web-metadata.com/submitted", + status=URLStatus.PENDING + ), + web_metadata_info=None, + expected_result=ExpectedResult( + has_html=False, + scrape_status=None + ) + ) +] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/setup/manager.py b/tests/automated/integration/tasks/url/html/setup/manager.py new file mode 100644 index 00000000..8e679a57 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/manager.py @@ -0,0 +1,87 @@ +import types + +from src.core.enums import RecordType +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from tests.automated.integration.tasks.url.html.mocks.methods import mock_get_from_cache, mock_parse +from tests.automated.integration.tasks.url.html.mocks.url_request_interface.core import MockURLRequestInterface +from tests.automated.integration.tasks.url.html.setup.data import TEST_ENTRIES +from tests.automated.integration.tasks.url.html.setup.models.record import TestURLHTMLTaskSetupRecord + + +class TestURLHTMLTaskSetupManager: + + def __init__(self, adb_client: AsyncDatabaseClient): + self.adb_client = adb_client + + 
+ async def setup(self) -> list[TestURLHTMLTaskSetupRecord]: + + records = await self._setup_urls() + await self.setup_web_metadata(records) + return records + + async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: + url_insert_models: list[URLInsertModel] = [] + for entry in TEST_ENTRIES: + url_insert_model = URLInsertModel( + outcome=entry.url_info.status, + url=entry.url_info.url, + name=f"Test for {entry.url_info.url}", + record_type=RecordType.RESOURCES + ) + url_insert_models.append(url_insert_model) + url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) + + records = [] + for url_id, entry in zip(url_ids, TEST_ENTRIES): + record = TestURLHTMLTaskSetupRecord( + url_id=url_id, + entry=entry + ) + records.append(record) + return records + + async def setup_web_metadata( + self, + records: list[TestURLHTMLTaskSetupRecord] + ) -> None: + models = [] + for record in records: + entry = record.entry + web_metadata_info = entry.web_metadata_info + if web_metadata_info is None: + continue + model = URLWebMetadataPydantic( + url_id=record.url_id, + accessed=web_metadata_info.accessed, + status_code=web_metadata_info.response_code.value, + content_type=web_metadata_info.content_type, + error_message=web_metadata_info.error_message + ) + models.append(model) + await self.adb_client.bulk_insert(models) + + + +async def setup_mocked_root_url_cache() -> RootURLCache: + mock_root_url_cache = RootURLCache() + mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache) + return mock_root_url_cache + + +async def setup_operator() -> URLHTMLTaskOperator: + html_parser = HTMLResponseParser( + root_url_cache=await setup_mocked_root_url_cache() + ) + html_parser.parse = types.MethodType(mock_parse, html_parser) + operator = URLHTMLTaskOperator( + adb_client=AsyncDatabaseClient(), + url_request_interface=MockURLRequestInterface(), + html_parser=html_parser + ) + return operator diff --git 
a/tests/automated/integration/tasks/url/html/setup/models/__init__.py b/tests/automated/integration/tasks/url/html/setup/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/setup/models/entry.py b/tests/automated/integration/tasks/url/html/setup/models/entry.py new file mode 100644 index 00000000..8cc2a8ad --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/models/entry.py @@ -0,0 +1,34 @@ +from enum import Enum +from http import HTTPStatus + +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus + + +class TestErrorType(Enum): + SCRAPER = "scraper" + HTTP_404 = "http-404" + + +class TestWebMetadataInfo(BaseModel): + accessed: bool + content_type: str | None + response_code: HTTPStatus + error_message: str | None + +class TestURLInfo(BaseModel): + url: str + status: URLStatus + +class ExpectedResult(BaseModel): + has_html: bool + scrape_status: ScrapeStatus | None # Does not have scrape info if none + web_metadata_status_marked_404: bool = False + +class TestURLHTMLTaskSetupEntry(BaseModel): + url_info: TestURLInfo + web_metadata_info: TestWebMetadataInfo | None + give_error: TestErrorType | None = None + expected_result: ExpectedResult \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/setup/models/record.py b/tests/automated/integration/tasks/url/html/setup/models/record.py new file mode 100644 index 00000000..7902dd81 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/models/record.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from tests.automated.integration.tasks.url.html.setup.models.entry import TestURLHTMLTaskSetupEntry + + +class TestURLHTMLTaskSetupRecord(BaseModel): + url_id: int + entry: TestURLHTMLTaskSetupEntry \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/test_task.py 
b/tests/automated/integration/tasks/url/html/test_task.py index da6753a4..fe059838 100644 --- a/tests/automated/integration/tasks/url/html/test_task.py +++ b/tests/automated/integration/tasks/url/html/test_task.py @@ -1,50 +1,34 @@ import pytest +from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from tests.automated.integration.tasks.url.html.asserts import assert_success_url_has_two_html_content_entries, assert_404_url_has_404_status, assert_task_has_one_url_error, \ - assert_task_type_is_html, assert_html_task_ran_without_error, assert_url_has_one_compressed_html_content_entry -from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_url_task_has_expected_run_info, \ - assert_prereqs_met -from tests.automated.integration.tasks.url.html.setup import setup_urls, setup_operator -from tests.helpers.data_creator.core import DBDataCreator +from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_prereqs_met, \ + assert_task_ran_without_error +from tests.automated.integration.tasks.url.html.check.manager import TestURLHTMLTaskCheckManager +from tests.automated.integration.tasks.url.html.setup.manager import setup_operator, \ + TestURLHTMLTaskSetupManager @pytest.mark.asyncio -async def test_url_html_task(db_data_creator: DBDataCreator): +async def test_url_html_task(adb_client_test: AsyncDatabaseClient): + setup = TestURLHTMLTaskSetupManager(adb_client_test) operator = await setup_operator() # No URLs were created, the prereqs should not be met await assert_prereqs_not_met(operator) - - # Add URLs without adding web metadata, the prereqs should not be met - url_ids = await setup_urls(db_data_creator) - await assert_prereqs_not_met(operator) - - # Add web metadata, the prereqs should be met - await db_data_creator.url_metadata(url_ids) + records = await setup.setup() await assert_prereqs_met(operator) - success_url_id = url_ids[0] - not_found_url_id = url_ids[1] - - task_id = 
await db_data_creator.adb_client.initiate_task(task_type=TaskType.HTML) + task_id = await adb_client_test.initiate_task(task_type=TaskType.HTML) run_info = await operator.run_task(task_id) - assert_url_task_has_expected_run_info(run_info, url_ids) - + assert_task_ran_without_error(run_info) - task_info = await db_data_creator.adb_client.get_task_info( - task_id=operator.task_id + checker = TestURLHTMLTaskCheckManager( + adb_client=adb_client_test, + records=records ) + await checker.check() - assert_html_task_ran_without_error(task_info) - assert_task_type_is_html(task_info) - assert_task_has_one_url_error(task_info) - - adb = db_data_creator.adb_client - await assert_success_url_has_two_html_content_entries(adb, run_info, success_url_id) - await assert_url_has_one_compressed_html_content_entry(adb, success_url_id) - await assert_404_url_has_404_status(adb, not_found_url_id) - - + await assert_prereqs_not_met(operator) diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index 6c9e95e3..dd947d65 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,5 +1,8 @@ -from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.instantiations.url.scrape_info.pydantic import URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer @@ -16,6 +19,7 @@ def __init__( async def run(self) -> None: html_content_infos = [] raw_html_info_list = [] + scraper_info_list = [] for url_id in self.url_ids: 
html_content_infos.append( URLHTMLContentInfo( @@ -36,6 +40,11 @@ async def run(self) -> None: html="" ) raw_html_info_list.append(raw_html_info) + scraper_info = URLScrapeInfoInsertModel( + url_id=url_id, + status=ScrapeStatus.SUCCESS, + ) + scraper_info_list.append(scraper_info) await self.adb_client.add_raw_html(raw_html_info_list) await self.adb_client.add_html_content_infos(html_content_infos) diff --git a/tests/helpers/data_creator/commands/impl/url_metadata.py b/tests/helpers/data_creator/commands/impl/url_metadata.py index 6eee58ed..9d3cf4ff 100644 --- a/tests/helpers/data_creator/commands/impl/url_metadata.py +++ b/tests/helpers/data_creator/commands/impl/url_metadata.py @@ -1,3 +1,5 @@ +from http import HTTPStatus + from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase @@ -7,11 +9,13 @@ class URLMetadataCommand(DBDataCreatorCommandBase): def __init__( self, url_ids: list[int], - content_type: str = "text/html" + content_type: str = "text/html", + status_code: int = HTTPStatus.OK.value ): super().__init__() self.url_ids = url_ids self.content_type = content_type + self.status_code = status_code async def run(self) -> None: url_metadata_infos = [] @@ -19,7 +23,7 @@ async def run(self) -> None: url_metadata = URLWebMetadataPydantic( url_id=url_id, accessed=True, - status_code=200, + status_code=self.status_code, content_type=self.content_type, error_message=None ) diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 070c9657..fed9c970 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -1,4 +1,5 @@ from datetime import datetime +from http import HTTPStatus from typing import Optional, Any from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo @@ -357,11 +358,13 @@ async def agency_user_suggestions( async def url_metadata( self, url_ids: 
list[int], - content_type: str = "text/html" + content_type: str = "text/html", + status_code: int = HTTPStatus.OK.value ) -> None: await self.run_command( URLMetadataCommand( url_ids=url_ids, - content_type=content_type + content_type=content_type, + status_code=status_code ) ) diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index 2145bcf1..630d0f71 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -1,6 +1,6 @@ from sqlalchemy import create_engine -from src.db.models.templates import Base +from src.db.models.templates_.base import Base def wipe_database(connection_string: str) -> None: diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index d5f2c313..df455e0e 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -12,3 +12,17 @@ def generate_test_urls(count: int) -> list[str]: results.append(url) return results + +def generate_test_html() -> str: + return """ + + + + Example HTML + + +

Example HTML

+

This is an example of HTML content.

+ + + """ \ No newline at end of file diff --git a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py index 612e7425..f3050d7b 100644 --- a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py +++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_deepseek_record_classifier(): - from src.db.dtos.url.html_content import HTMLContentType as hct + from src.db.models.instantiations.url.html.content.enums import HTMLContentType as hct d = { hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA", diff --git a/tests/manual/llm_api_logic/test_openai_record_classifier.py b/tests/manual/llm_api_logic/test_openai_record_classifier.py index 7f3cb67e..b0105437 100644 --- a/tests/manual/llm_api_logic/test_openai_record_classifier.py +++ b/tests/manual/llm_api_logic/test_openai_record_classifier.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_openai_record_classifier(): - from src.db.dtos.url.html_content import HTMLContentType as hct + from src.db.models.instantiations.url.html.content.enums import HTMLContentType as hct d = { hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA", From 7b80acf48deaeb93bec180e10b93c91b4d6c31bc Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 3 Aug 2025 08:47:34 -0400 Subject: [PATCH 12/13] Fix broken imports --- .../unit/source_collectors/test_autogoogler_collector.py | 2 +- .../unit/source_collectors/test_common_crawl_collector.py | 2 +- .../unit/source_collectors/test_muckrock_collectors.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index fc7d0bba..20ddc362 100644 --- 
a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -5,9 +5,9 @@ from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector +from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.fixture diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 66328993..622da31b 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -4,9 +4,9 @@ from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector +from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.fixture diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index 22695f44..a8afe591 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -6,11 +6,11 @@ from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector from 
src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO from src.collectors.source_collectors.muckrock.fetch_requests.foia import FOIAFetchRequest +from src.db.models.instantiations.url.core.pydantic.info import URLInfo PATCH_ROOT = "src.collectors.source_collectors.muckrock" From 284eb661ce00ac2da02b020172cea48f7da827a9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 3 Aug 2025 09:02:17 -0400 Subject: [PATCH 13/13] fix bug when checking for marked as 404 --- src/db/client/async_.py | 2 +- tests/automated/integration/tasks/url/html/check/manager.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 9bc29ed8..25b40852 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -1446,7 +1446,7 @@ async def mark_all_as_duplicates(self, url_ids: List[int]): async def mark_all_as_404(self, url_ids: List[int]): query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.NOT_FOUND.value) await self.execute(query) - query = update(URLWebMetadata).where(URLWebMetadata.id.in_(url_ids)).values(status_code=404) + query = update(URLWebMetadata).where(URLWebMetadata.url_id.in_(url_ids)).values(status_code=404) await self.execute(query) async def mark_all_as_recently_probed_for_404( diff --git a/tests/automated/integration/tasks/url/html/check/manager.py b/tests/automated/integration/tasks/url/html/check/manager.py index accb7409..71a48b42 100644 --- a/tests/automated/integration/tasks/url/html/check/manager.py +++ 
b/tests/automated/integration/tasks/url/html/check/manager.py @@ -59,7 +59,9 @@ async def _check_has_same_url_status(self): assert url.outcome == entry.url_info.status, f"URL {url.url} has outcome {url.outcome} instead of {entry.url_info.status}" async def _check_marked_as_404(self): - web_metadata_list: list[URLWebMetadata] = await self.adb_client.get_all(URLWebMetadata) + web_metadata_list: list[URLWebMetadata] = await self.adb_client.get_all( + URLWebMetadata + ) for web_metadata in web_metadata_list: entry = self._id_to_entry[web_metadata.url_id] if entry.expected_result.web_metadata_status_marked_404: