From 66d56adc17a7810af9170bb1718c137581a6e800 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 2 Jun 2025 10:03:28 -0400 Subject: [PATCH] Fix bug when parsing dynamic URLs --- pytest.ini | 4 +- src/core/tasks/operators/url_html/core.py | 6 ++- .../scraper/request_interface/core.py | 2 +- tests/automated/integration/api/conftest.py | 27 +---------- tests/automated/integration/api/test_task.py | 2 +- tests/conftest.py | 6 ++- tests/helpers/api_test_helper.py | 29 ++++++++++++ tests/manual/core/tasks/__init__.py | 0 .../core/tasks/test_url_html_task_operator.py | 47 +++++++++++++++++++ .../test_html_tag_collector_integration.py | 10 ++-- 10 files changed, 96 insertions(+), 37 deletions(-) create mode 100644 tests/helpers/api_test_helper.py create mode 100644 tests/manual/core/tasks/__init__.py create mode 100644 tests/manual/core/tasks/test_url_html_task_operator.py diff --git a/pytest.ini b/pytest.ini index 7ffaf369..8f6981ae 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,5 @@ [pytest] timeout = 300 -asyncio_default_fixture_loop_scope=function \ No newline at end of file +asyncio_default_fixture_loop_scope=function +markers = + manual: mark test as manual-only (excluded from default test runs) \ No newline at end of file diff --git a/src/core/tasks/operators/url_html/core.py b/src/core/tasks/operators/url_html/core.py index 9ae4b6fc..c67f2e8c 100644 --- a/src/core/tasks/operators/url_html/core.py +++ b/src/core/tasks/operators/url_html/core.py @@ -37,12 +37,14 @@ async def inner_task_logic(self): await self.get_raw_html_data_for_urls(tdos) success_subset, error_subset = await self.separate_success_and_error_subsets(tdos) non_404_error_subset, is_404_error_subset = await self.separate_error_and_404_subsets(error_subset) + await self.process_html_data(success_subset) + await self.update_database(is_404_error_subset, non_404_error_subset, success_subset) + + async def update_database(self, is_404_error_subset, non_404_error_subset, success_subset): await self.update_errors_in_database(non_404_error_subset) await self.update_404s_in_database(is_404_error_subset) - await self.process_html_data(success_subset) await self.update_html_data_in_database(success_subset) - async def get_just_urls(self, tdos: list[UrlHtmlTDO]): return [task_info.url_info.url for task_info in tdos] diff --git a/src/core/tasks/operators/url_html/scraper/request_interface/core.py b/src/core/tasks/operators/url_html/scraper/request_interface/core.py index 4a222aa3..46d43df2 100644 --- a/src/core/tasks/operators/url_html/scraper/request_interface/core.py +++ b/src/core/tasks/operators/url_html/scraper/request_interface/core.py @@ -37,7 +37,7 @@ async def fetch_and_render(self, rr: RequestResources, url: str) -> Optional[URL if simple_response.content_type != HTML_CONTENT_TYPE: return simple_response - await self.get_dynamic_html_content(rr, url) + return await self.get_dynamic_html_content(rr, url) async def get_dynamic_html_content(self, rr, url): # For HTML responses, attempt to load the page to check for dynamic html content diff --git a/tests/automated/integration/api/conftest.py b/tests/automated/integration/api/conftest.py index aae25b48..ae25b263 100644 --- a/tests/automated/integration/api/conftest.py +++ b/tests/automated/integration/api/conftest.py @@ -1,5 +1,3 @@ -import asyncio -from dataclasses import dataclass from typing import Generator, Any, AsyncGenerator from unittest.mock import AsyncMock @@ -7,37 +5,14 @@ import pytest_asyncio from starlette.testclient import TestClient -from src.api.endpoints.batch.dtos.get.status import GetBatchStatusResponse from src.api.endpoints.review.routes import requires_final_review_permission from src.api.main import app from src.core.core import AsyncCore -from src.core.enums import BatchStatus from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo from src.security.enums import Permissions from tests.automated.integration.api.helpers.RequestValidator import RequestValidator -from tests.helpers.db_data_creator import DBDataCreator - - -@dataclass -class APITestHelper: - request_validator: RequestValidator - async_core: AsyncCore - db_data_creator: DBDataCreator - - def adb_client(self): - return self.db_data_creator.adb_client - - async def wait_for_all_batches_to_complete(self): - for i in range(20): - data: GetBatchStatusResponse = self.request_validator.get_batch_statuses( - status=BatchStatus.IN_PROCESS - ) - if len(data.results) == 0: - return - print("Waiting...") - await asyncio.sleep(0.1) - raise ValueError("Batches did not complete in expected time") +from tests.helpers.api_test_helper import APITestHelper MOCK_USER_ID = 1 diff --git a/tests/automated/integration/api/test_task.py b/tests/automated/integration/api/test_task.py index 21e662f1..e74908e3 100644 --- a/tests/automated/integration/api/test_task.py +++ b/tests/automated/integration/api/test_task.py @@ -1,7 +1,7 @@ import pytest from src.db.enums import TaskType -from tests.automated.integration.api.conftest import APITestHelper +from tests.helpers.api_test_helper import APITestHelper async def task_setup(ath: APITestHelper) -> int: diff --git a/tests/conftest.py b/tests/conftest.py index 3485f3bd..3bdcb79e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +from typing import Any, Generator + import pytest from alembic.config import Config from sqlalchemy import create_engine, inspect, MetaData @@ -97,7 +99,7 @@ def wipe_database(): @pytest.fixture -def db_client_test(wipe_database) -> DatabaseClient: +def db_client_test(wipe_database) -> Generator[DatabaseClient, Any, None]: # Drop pre-existing table conn = get_postgres_connection_string() db_client = DatabaseClient(db_url=conn) @@ -105,7 +107,7 @@ def db_client_test(wipe_database) -> DatabaseClient: db_client.engine.dispose() @pytest.fixture -def adb_client_test(wipe_database) -> AsyncDatabaseClient: +def adb_client_test(wipe_database) -> Generator[AsyncDatabaseClient, Any, None]: conn = get_postgres_connection_string(is_async=True) adb_client = AsyncDatabaseClient(db_url=conn) yield adb_client diff --git a/tests/helpers/api_test_helper.py b/tests/helpers/api_test_helper.py new file mode 100644 index 00000000..fa577b34 --- /dev/null +++ b/tests/helpers/api_test_helper.py @@ -0,0 +1,29 @@ +import asyncio +from dataclasses import dataclass + +from src.api.endpoints.batch.dtos.get.status import GetBatchStatusResponse +from src.core.core import AsyncCore +from src.core.enums import BatchStatus +from tests.automated.integration.api.helpers.RequestValidator import RequestValidator +from tests.helpers.db_data_creator import DBDataCreator + + +@dataclass +class APITestHelper: + request_validator: RequestValidator + async_core: AsyncCore + db_data_creator: DBDataCreator + + def adb_client(self): + return self.db_data_creator.adb_client + + async def wait_for_all_batches_to_complete(self): + for i in range(20): + data: GetBatchStatusResponse = self.request_validator.get_batch_statuses( + status=BatchStatus.IN_PROCESS + ) + if len(data.results) == 0: + return + print("Waiting...") + await asyncio.sleep(0.1) + raise ValueError("Batches did not complete in expected time") diff --git a/tests/manual/core/tasks/__init__.py b/tests/manual/core/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/core/tasks/test_url_html_task_operator.py b/tests/manual/core/tasks/test_url_html_task_operator.py new file mode 100644 index 00000000..b4f343fe --- /dev/null +++ b/tests/manual/core/tasks/test_url_html_task_operator.py @@ -0,0 +1,47 @@ +from unittest.mock import patch + +import pytest + +from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO, ManualBatchInnerInputDTO +from src.core.tasks.operators.url_html.core import URLHTMLTaskOperator +from src.core.tasks.operators.url_html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.operators.url_html.scraper.root_url_cache.core import RootURLCache + + +@pytest.mark.asyncio +@pytest.mark.manual +async def test_url_html_task_operator( + adb_client_test, +): + urls_to_insert = [ + "https://www.albanyca.org/departments/fire-department/programs-classes-events", + "https://www.albanyca.gov/Departments/Police-Department/Crime-Mapping", + "https://www.facebook.com/AlbanyPoliceCa/", + "https://www.governmentjobs.com/careers/albanyca/jobs/3395149/police-officer?pagetype=jobOpportunitiesJobs", + "https://www.albanyca.org/", + "https://www.albanyca.gov/Departments/Police-Department", + "https://www.joinalbanypd.us/", + "https://www.albanyca.gov/Departments/Police-Department/Contact-Albany-Police", + "https://www.albanyca.org/departments/police-department/policies-procedures-training-sb978", + "https://www.yelp.com/biz/albany-police-department-albany-3", + ] + parser = HTMLResponseParser( + root_url_cache=RootURLCache( + adb_client=adb_client_test + ) + ) + manual_batch_dto = ManualBatchInputDTO( + name="Test Batch", + entries=[ + ManualBatchInnerInputDTO(url=url) for url in urls_to_insert + ] + ) + await adb_client_test.upload_manual_batch(dto=manual_batch_dto, user_id=1) + operator = URLHTMLTaskOperator( + adb_client=adb_client_test, + url_request_interface=URLRequestInterface(), + html_parser=parser + ) + run_info = await operator.run_task(1) + pass \ No newline at end of file diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 45b1daf9..4a40b84e 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -1,9 +1,12 @@ import pytest -from src.core.tasks.operators.url_html import URLHTMLTaskOperator +from src.core.tasks.operators.url_html.core import URLHTMLTaskOperator +from src.core.tasks.operators.url_html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url_info import URLInfo from tests.helpers.db_data_creator import DBDataCreator -from src.core.tasks.operators.url_html.scraper import HTMLResponseParser -from src.core.tasks.operators.url_html.scraper.request_interface import URLRequestInterface URLS = [ "https://pdap.io", @@ -71,7 +74,6 @@ async def test_url_html_cycle( url_infos.append(URLInfo(url=url)) await adb_client.insert_urls(url_infos=url_infos, batch_id=batch_id) - operator = URLHTMLTaskOperator( adb_client=adb_client, url_request_interface=URLRequestInterface(),