diff --git a/alembic/env.py b/alembic/env.py index 2cf7e6c8..ff14698b 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,4 +1,3 @@ -import logging from datetime import datetime from logging.config import fileConfig @@ -7,7 +6,7 @@ from sqlalchemy import pool from src.db.helpers.connect import get_postgres_connection_string -from src.db.models.templates import Base +from src.db.models.templates_.base import Base # this is the Alembic Config object, which provides # access to the values within the .ini file in use. diff --git a/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py new file mode 100644 index 00000000..891bef3a --- /dev/null +++ b/alembic/versions/2025_07_31_1536-99eceed6e614_add_web_status_info_table.py @@ -0,0 +1,156 @@ +"""Add Web Status Info table
+
+Revision ID: 99eceed6e614
+Revises: 637de6eaa3ab
+Create Date: 2025-07-31 15:36:40.966605
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from src.util.alembic_helpers import id_column, created_at_column, updated_at_column, url_id_column, switch_enum_type
+
+# revision identifiers, used by Alembic.
+revision: str = '99eceed6e614'
+down_revision: Union[str, None] = '637de6eaa3ab'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+WEB_STATUS_ENUM = sa.Enum(
+    "not_attempted",
+    "success",
+    "error",
+    "404_not_found",
+    name="web_status"
+)
+SCRAPE_STATUS_ENUM = sa.Enum(
+    "success",
+    "error",
+    name="scrape_status",
+)
+
+URL_WEB_METADATA_TABLE_NAME = 'url_web_metadata'
+URL_SCRAPE_INFO = 'url_scrape_info'
+
+
+def upgrade() -> None:
+    _create_url_web_metadata_table()
+    _add_url_probe_task_type_enum()
+    _set_up_scrape_info_table()
+    _use_existing_html_data_to_add_scrape_info()
+
+
+def _use_existing_html_data_to_add_scrape_info() -> None:
+    op.execute(
+        f"""
+        INSERT INTO {URL_SCRAPE_INFO} (url_id, status)
+        SELECT url_id, 'success'::scrape_status
+        FROM url_compressed_html
+        """
+    )
+    op.execute(
+        f"""
+        INSERT INTO {URL_SCRAPE_INFO} (url_id, status)
+        SELECT DISTINCT url_id, 'success'::scrape_status
+        FROM url_html_content
+        LEFT JOIN url_compressed_html USING (url_id)
+        WHERE url_compressed_html.url_id IS NULL
+        """
+    )
+
+
+def downgrade() -> None:
+    _drop_url_web_metadata_table()
+    # Drop Enums
+    WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True)
+    _drop_url_probe_task_type_enum()
+    _tear_down_scrape_info_table()
+
+
+def _set_up_scrape_info_table() -> None:
+    op.create_table(
+        URL_SCRAPE_INFO,
+        id_column(),
+        url_id_column(),
+        sa.Column(
+            'status',
+            SCRAPE_STATUS_ENUM,
+            nullable=False,
+            comment='The status of the most recent scrape attempt.'
+        ),
+        created_at_column(),
+        updated_at_column(),
+        sa.UniqueConstraint('url_id', name='uq_url_scrape_info_url_id')
+    )
+
+
+def _tear_down_scrape_info_table() -> None:
+    op.drop_table(URL_SCRAPE_INFO)
+    # Drop enum
+    SCRAPE_STATUS_ENUM.drop(op.get_bind(), checkfirst=True)
+
+
+def _add_url_probe_task_type_enum() -> None:
+    switch_enum_type(
+        table_name='tasks',
+        column_name='task_type',
+        enum_name='task_type',
+        new_enum_values=[
+            'HTML',
+            'Relevancy',
+            'Record Type',
+            'Agency Identification',
+            'Misc Metadata',
+            'Submit Approved URLs',
+            'Duplicate Detection',
+            '404 Probe',
+            'Sync Agencies',
+            'Sync Data Sources',
+            'Push to Hugging Face',
+            'URL Probe'
+        ]
+    )
+
+
+def _drop_url_probe_task_type_enum() -> None:
+    switch_enum_type(
+        table_name='tasks',
+        column_name='task_type',
+        enum_name='task_type',
+        new_enum_values=[
+            'HTML',
+            'Relevancy',
+            'Record Type',
+            'Agency Identification',
+            'Misc Metadata',
+            'Submit Approved URLs',
+            'Duplicate Detection',
+            '404 Probe',
+            'Sync Agencies',
+            'Sync Data Sources',
+            'Push to Hugging Face'
+        ]
+    )
+
+
+def _create_url_web_metadata_table() -> None:
+    op.create_table(
+        URL_WEB_METADATA_TABLE_NAME,
+        id_column(),
+        url_id_column(),
+        sa.Column('accessed', sa.Boolean(), nullable=False),
+        sa.Column('status_code', sa.Integer(), nullable=True),
+        sa.Column('content_type', sa.Text(), nullable=True),
+        sa.Column('error_message', sa.Text(), nullable=True),
+        created_at_column(),
+        updated_at_column(),
+        sa.UniqueConstraint('url_id', name='uq_url_web_status_info_url_id'),
+        sa.CheckConstraint('status_code >= 100', name='ck_url_web_status_info_status_code_min'),
+        sa.CheckConstraint('status_code <= 999', name='ck_url_web_status_info_status_code_max'),
+    )
+
+
+def _drop_url_web_metadata_table() -> None:
+    op.drop_table(URL_WEB_METADATA_TABLE_NAME)
diff --git a/local_database/classes/DockerClient.py b/local_database/classes/DockerClient.py index ca9d535b..5c33e7d9 100644 --- a/local_database/classes/DockerClient.py +++ b/local_database/classes/DockerClient.py @@ -1,5 +1,7 @@ import docker from docker.errors import NotFound, APIError +from docker.models.containers import Container +from docker.models.networks import Network from local_database.DTOs import DockerfileInfo, DockerInfo @@ -9,7 +11,7 @@ class DockerClient: def __init__(self): self.client = docker.from_env() - def run_command(self, command: str, container_id: str): + def run_command(self, command: str, container_id: str) -> None: exec_id = self.client.api.exec_create( container_id, cmd=command, @@ -20,7 +22,7 @@ def run_command(self, command: str, container_id: str): for line in output_stream: print(line.decode().rstrip()) - def start_network(self, network_name): + def start_network(self, network_name: str) -> Network: try: self.client.networks.create(network_name, driver="bridge") except APIError as e: @@ -30,14 +32,14 @@ def start_network(self, network_name): print("Network already exists") return self.client.networks.get(network_name) - def stop_network(self, network_name): + def stop_network(self, network_name: str) -> None: self.client.networks.get(network_name).remove() def get_image( self, dockerfile_info: DockerfileInfo, force_rebuild: bool = False - ): + ) -> None: if dockerfile_info.dockerfile_directory: # Build image from Dockerfile self.client.images.build( @@ -58,7 +60,7 @@ def get_image( except NotFound: self.client.images.pull(dockerfile_info.image_tag) - def get_existing_container(self, docker_info_name: str): + def get_existing_container(self, docker_info_name: str) -> Container | None:
try: return self.client.containers.get(docker_info_name) except NotFound: diff --git a/local_database/classes/DockerContainer.py b/local_database/classes/DockerContainer.py index 33b71ce0..0a86e601 100644 --- a/local_database/classes/DockerContainer.py +++ b/local_database/classes/DockerContainer.py @@ -11,19 +11,19 @@ def __init__(self, dc: DockerClient, container: Container): self.dc = dc self.container = container - def run_command(self, command: str): + def run_command(self, command: str) -> None: self.dc.run_command(command, self.container.id) - def stop(self): + def stop(self) -> None: self.container.stop() - def log_to_file(self): + def log_to_file(self) -> None: logs = self.container.logs(stdout=True, stderr=True) container_name = self.container.name with open(f"{container_name}.log", "wb") as f: f.write(logs) - def wait_for_pg_to_be_ready(self): + def wait_for_pg_to_be_ready(self) -> None: for i in range(30): exit_code, output = self.container.exec_run("pg_isready") print(output) diff --git a/local_database/classes/DockerManager.py b/local_database/classes/DockerManager.py index ac294dc1..fc32c3bc 100644 --- a/local_database/classes/DockerManager.py +++ b/local_database/classes/DockerManager.py @@ -4,6 +4,8 @@ import docker from docker.errors import APIError +from docker.models.containers import Container +from docker.models.networks import Network from local_database.DTOs import DockerfileInfo, DockerInfo from local_database.classes.DockerClient import DockerClient @@ -20,7 +22,7 @@ def __init__(self): self.network = self.start_network() @staticmethod - def start_docker_engine(): + def start_docker_engine() -> None: system = platform.system() match system: @@ -41,7 +43,7 @@ def start_docker_engine(): sys.exit(1) @staticmethod - def is_docker_running(): + def is_docker_running() -> bool: try: client = docker.from_env() client.ping() @@ -50,16 +52,23 @@ def is_docker_running(): print(f"Docker is not running: {e}") return False - def run_command(self, command: str, container_id: str): + def run_command( + self, + command: str, + container_id: str + ) -> None: self.client.run_command(command, container_id) - def start_network(self): + def start_network(self) -> Network: return self.client.start_network(self.network_name) - def stop_network(self): + def stop_network(self) -> None: self.client.stop_network(self.network_name) - def get_image(self, dockerfile_info: DockerfileInfo): + def get_image( + self, + dockerfile_info: DockerfileInfo + ) -> None: self.client.get_image(dockerfile_info) def run_container( @@ -74,5 +83,5 @@ def run_container( ) return DockerContainer(self.client, raw_container) - def get_containers(self): + def get_containers(self) -> list[Container]: return self.client.client.containers.list() \ No newline at end of file diff --git a/local_database/classes/TimestampChecker.py b/local_database/classes/TimestampChecker.py index 56779fd4..fc2c25a0 100644 --- a/local_database/classes/TimestampChecker.py +++ b/local_database/classes/TimestampChecker.py @@ -1,27 +1,26 @@ -import datetime import os -from typing import Optional +from datetime import datetime, timedelta class TimestampChecker: def __init__(self): - self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time() + self.last_run_time: datetime | None = self.load_last_run_time() - def load_last_run_time(self) -> Optional[datetime.datetime]: + def load_last_run_time(self) -> datetime | None: # Check if file `last_run.txt` exists # If it does, load the last run time if 
os.path.exists("local_state/last_run.txt"): with open("local_state/last_run.txt", "r") as f: - return datetime.datetime.strptime( + return datetime.strptime( f.read(), "%Y-%m-%d %H:%M:%S" ) return None - def last_run_within_24_hours(self): + def last_run_within_24_hours(self) -> bool: if self.last_run_time is None: return False - return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1) + return datetime.now() - self.last_run_time < timedelta(days=1) def set_last_run_time(self): # If directory `local_state` doesn't exist, create it @@ -29,4 +28,4 @@ def set_last_run_time(self): os.makedirs("local_state") with open("local_state/last_run.txt", "w") as f: - f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) diff --git a/local_database/create_database.py b/local_database/create_database.py index 67eae70b..e18cbd2a 100644 --- a/local_database/create_database.py +++ b/local_database/create_database.py @@ -15,7 +15,7 @@ # Connect to the default 'postgres' database to create other databases -def connect(database="postgres", autocommit=True): +def connect(database="postgres", autocommit=True) -> psycopg2.extensions.connection: conn = psycopg2.connect( dbname=database, user=POSTGRES_USER, @@ -27,7 +27,7 @@ def connect(database="postgres", autocommit=True): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) return conn -def create_database(db_name): +def create_database(db_name: str) -> None: conn = connect("postgres") with conn.cursor() as cur: cur.execute(sql.SQL(""" @@ -48,7 +48,7 @@ def create_database(db_name): except Exception as e: print(f"❌ Failed to create {db_name}: {e}") -def main(): +def main() -> None: print("Creating databases...") create_database(LOCAL_SOURCE_COLLECTOR_DB_NAME) diff --git a/local_database/setup.py b/local_database/setup.py index 99ff1da9..64f5af48 100644 --- a/local_database/setup.py +++ b/local_database/setup.py @@ -7,14 +7,19 @@ MAX_RETRIES = 20 SLEEP_SECONDS = 1 -def run_command(cmd, check=True, capture_output=False, **kwargs): +def run_command( + cmd: str, + check: bool = True, + capture_output: bool = False, + **kwargs: dict +) -> subprocess.CompletedProcess: try: return subprocess.run(cmd, shell=True, check=check, capture_output=capture_output, text=True, **kwargs) except subprocess.CalledProcessError as e: print(f"Command '{cmd}' failed: {e}") sys.exit(1) -def get_postgres_container_id(): +def get_postgres_container_id() -> str: result = run_command(f"docker-compose ps -q {POSTGRES_SERVICE_NAME}", capture_output=True) container_id = result.stdout.strip() if not container_id: @@ -22,7 +27,7 @@ def get_postgres_container_id(): sys.exit(1) return container_id -def wait_for_postgres(container_id): +def wait_for_postgres(container_id: str) -> None: print("Waiting for Postgres to be ready...") for i in range(MAX_RETRIES): try: @@ -36,7 +41,7 @@ def wait_for_postgres(container_id): print("Postgres did not become ready in time.") sys.exit(1) -def main(): +def main() -> None: print("Stopping Docker Compose...") run_command("docker-compose down") diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 27f7a382..66a5e3fb 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -7,7 +7,7 @@ from src.api.endpoints.annotate.agency.get.queries.agency_suggestion 
import GetAgencySuggestionsQueryBuilder from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus -from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info +from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py index a7e30385..1e9fc5fa 100644 --- a/src/api/endpoints/annotate/dtos/shared/base/response.py +++ b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.mapping import URLMapping diff --git a/src/api/endpoints/batch/urls/dto.py b/src/api/endpoints/batch/urls/dto.py index 13e8659c..90f9b209 100644 --- a/src/api/endpoints/batch/urls/dto.py +++ b/src/api/endpoints/batch/urls/dto.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class GetURLsByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 49b95e13..980b4c81 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,8 +1,8 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/review/next/dto.py b/src/api/endpoints/review/next/dto.py index 7fc53b17..a9c378b9 100644 --- a/src/api/endpoints/review/next/dto.py +++ b/src/api/endpoints/review/next/dto.py @@ -5,7 +5,7 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.core.enums import RecordType, SuggestedStatus -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo class FinalReviewAnnotationRelevantInfo(BaseModel): diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index 0ec83dc1..d89aa4da 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -7,7 +7,7 @@ from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata, FinalReviewBatchInfo, \ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.url_html.scraper.parser.util import convert_to_response_html_info +from 
src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.constants import USER_ANNOTATION_MODELS from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo diff --git a/src/api/endpoints/task/by_id/dto.py b/src/api/endpoints/task/by_id/dto.py index 9213aa90..eba6cece 100644 --- a/src/api/endpoints/task/by_id/dto.py +++ b/src/api/endpoints/task/by_id/dto.py @@ -3,8 +3,8 @@ from pydantic import BaseModel +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.enums import TaskType from src.core.enums import BatchStatus diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index 8133085f..c2b32234 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -5,7 +5,7 @@ from src.api.endpoints.task.by_id.dto import TaskInfo from src.collectors.enums import URLStatus from src.core.enums import BatchStatus -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.db.models.instantiations.task.core import Task diff --git a/src/api/main.py b/src/api/main.py index 46ae4a3a..e9916724 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -26,11 +26,11 @@ from src.core.tasks.scheduled.manager import AsyncScheduledTaskManager from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.external.huggingface.hub.client import HuggingFaceHubClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index 460cf0e0..b41eba76 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ -1,7 +1,7 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class AutoGooglerPreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/base.py b/src/core/preprocessors/base.py index beb31cb7..2f777d5f 100644 --- a/src/core/preprocessors/base.py +++ b/src/core/preprocessors/base.py @@ -2,7 +2,7 @@ from abc import ABC from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class PreprocessorBase(ABC): diff --git 
a/src/core/preprocessors/ckan.py b/src/core/preprocessors/ckan.py index b72ee3c9..0b1cef2e 100644 --- a/src/core/preprocessors/ckan.py +++ b/src/core/preprocessors/ckan.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class CKANPreprocessor: diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index 16f5d730..d2f0d988 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,7 +1,7 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class CommonCrawlerPreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index 691d23c6..580b739e 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -1,8 +1,8 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class ExamplePreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index b42a198f..b0f1d9bc 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,7 +1,7 @@ from typing import List -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class MuckrockPreprocessor(PreprocessorBase): diff --git a/src/core/tasks/scheduled/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/huggingface/queries/check/requester.py index 6af94560..33a79043 100644 --- a/src/core/tasks/scheduled/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/huggingface/queries/check/requester.py @@ -7,7 +7,7 @@ from src.collectors.enums import URLStatus from src.db.helpers.session import session_helper as sh from src.db.models.instantiations.state.huggingface import HuggingFaceUploadState -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL diff --git a/src/core/tasks/scheduled/huggingface/queries/get/core.py b/src/core/tasks/scheduled/huggingface/queries/get/core.py index 7deea322..906f4d4f 100644 --- a/src/core/tasks/scheduled/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/huggingface/queries/get/core.py @@ -1,5 +1,3 @@ -from typing import Any - from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -7,7 +5,7 @@ from src.core.tasks.scheduled.huggingface.queries.get.convert import convert_url_status_to_relevant, \ convert_fine_to_coarse_record_type from src.core.tasks.scheduled.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from 
src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 50ff8920..59896f94 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,15 +7,16 @@ from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator +from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier -from src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator -from src.core.tasks.url.operators.url_404_probe.core import URL404ProbeTaskOperator -from src.core.tasks.url.operators.url_duplicate.core import URLDuplicateTaskOperator -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_miscellaneous_metadata.core import URLMiscellaneousMetadataTaskOperator +from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator +from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient @@ -101,8 +102,16 @@ async def get_url_auto_relevance_task_operator(self): ) return operator + async def get_url_probe_task_operator(self): + operator = URLProbeTaskOperator( + adb_client=self.adb_client, + url_request_interface=self.url_request_interface + ) + return operator + async def get_task_operators(self) -> list[URLTaskOperatorBase]: return [ + await self.get_url_probe_task_operator(), await self.get_url_html_task_operator(), await self.get_url_duplicate_task_operator(), await self.get_url_404_probe_task_operator(), diff --git a/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py b/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py index f42ecfc2..39f2cab3 100644 --- a/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py +++ b/src/core/tasks/url/operators/agency_identification/dtos/suggestion.py @@ -8,9 +8,9 @@ class URLAgencySuggestionInfo(BaseModel): url_id: int suggestion_type: SuggestionType = SuggestionType.UNKNOWN - pdap_agency_id: Optional[int] = None - agency_name: Optional[str] = None - state: Optional[str] = None - county: Optional[str] = None - locality: Optional[str] = None - user_id: Optional[int] = None + pdap_agency_id: int | None = None + agency_name: str 
| None = None + state: str | None = None + county: str | None = None + locality: str | None = None + user_id: int | None = None diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index 78e4c983..2ec72836 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -6,7 +6,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/submit_approved_url/__init__.py b/src/core/tasks/url/operators/duplicate/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/__init__.py rename to src/core/tasks/url/operators/duplicate/__init__.py diff --git a/src/core/tasks/url/operators/url_duplicate/core.py b/src/core/tasks/url/operators/duplicate/core.py similarity index 95% rename from src/core/tasks/url/operators/url_duplicate/core.py rename to src/core/tasks/url/operators/duplicate/core.py index ed3d00a5..dba0147c 100644 --- a/src/core/tasks/url/operators/url_duplicate/core.py +++ b/src/core/tasks/url/operators/duplicate/core.py @@ -4,7 +4,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.core.tasks.url.operators.url_duplicate.tdo import URLDuplicateTDO +from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/url/operators/url_duplicate/tdo.py b/src/core/tasks/url/operators/duplicate/tdo.py similarity index 100% rename from src/core/tasks/url/operators/url_duplicate/tdo.py rename to src/core/tasks/url/operators/duplicate/tdo.py diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/__init__.py b/src/core/tasks/url/operators/html/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/queries/__init__.py rename to src/core/tasks/url/operators/html/__init__.py diff --git a/src/core/tasks/url/operators/url_html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py similarity index 78% rename from src/core/tasks/url/operators/url_html/content_info_getter.py rename to src/core/tasks/url/operators/html/content_info_getter.py index 644e12e4..fb7bdd59 100644 --- a/src/core/tasks/url/operators/url_html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,5 +1,6 @@ -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType class HTMLContentInfoGetter: diff --git a/src/core/tasks/url/operators/html/core.py 
b/src/core/tasks/url/operators/html/core.py new file mode 100644 index 00000000..00c1d1c3 --- /dev/null +++ b/src/core/tasks/url/operators/html/core.py @@ -0,0 +1,84 @@
+from src.core.tasks.url.operators.base import URLTaskOperatorBase
+from src.core.tasks.url.operators.html.filter import filter_just_urls, filter_404_subset
+from src.core.tasks.url.operators.html.queries.insert.query import InsertURLHTMLInfoQueryBuilder
+from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser
+from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.enums import TaskType
+from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.external.url_request.core import URLRequestInterface
+
+
+class URLHTMLTaskOperator(URLTaskOperatorBase):
+
+    def __init__(
+        self,
+        url_request_interface: URLRequestInterface,
+        adb_client: AsyncDatabaseClient,
+        html_parser: HTMLResponseParser
+    ):
+        super().__init__(adb_client)
+        self.url_request_interface = url_request_interface
+        self.html_parser = html_parser
+
+    @property
+    def task_type(self) -> TaskType:
+        return TaskType.HTML
+
+    async def meets_task_prerequisites(self) -> bool:
+        return await self.adb_client.has_non_errored_urls_without_html_data()
+
+    async def inner_task_logic(self) -> None:
+        tdos = await self._get_non_errored_urls_without_html_data()
+        url_ids = [tdo.url_info.id for tdo in tdos]
+        await self.link_urls_to_task(url_ids=url_ids)
+
+        await self._get_raw_html_data_for_urls(tdos)
+        await self._process_html_data(tdos)
+
+        tdos_404 = await filter_404_subset(tdos)
+        await self._update_404s_in_database(tdos_404)
+        await self._update_html_data_in_database(tdos)
+
+    async def _get_non_errored_urls_without_html_data(self) -> list[UrlHtmlTDO]:
+        pending_urls: list[URLInfo] = await self.adb_client.get_non_errored_urls_without_html_data()
+        tdos = [
+            UrlHtmlTDO(
+                url_info=url_info,
+            ) for url_info in pending_urls
+        ]
+        return tdos
+
+    async def _get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]) -> None:
+        just_urls = await filter_just_urls(tdos)
+        url_response_infos = await self.url_request_interface.make_requests_with_html(just_urls)
+        for tdo, url_response_info in zip(tdos, url_response_infos):
+            tdo.url_response_info = url_response_info
+
+    async def _update_404s_in_database(self, tdos_404: list[UrlHtmlTDO]) -> None:
+        url_ids = [tdo.url_info.id for tdo in tdos_404]
+        await self.adb_client.mark_all_as_404(url_ids)
+
+    async def _process_html_data(self, tdos: list[UrlHtmlTDO]) -> None:
+        """
+        Modifies:
+            tdo.html_tag_info
+        """
+        for tdo in tdos:
+            if not tdo.url_response_info.success:
+                continue
+            html_tag_info = await self.html_parser.parse(
+                url=tdo.url_info.url,
+                html_content=tdo.url_response_info.html,
+                content_type=tdo.url_response_info.content_type
+            )
+            tdo.html_tag_info = html_tag_info
+
+    async def _update_html_data_in_database(self, tdos: list[UrlHtmlTDO]) -> None:
+        await self.adb_client.run_query_builder(
+            InsertURLHTMLInfoQueryBuilder(tdos, task_id=self.task_id)
+        )
diff --git a/src/core/tasks/url/operators/html/filter.py b/src/core/tasks/url/operators/html/filter.py new file mode 100644 index 00000000..86da0e8a --- /dev/null +++ b/src/core/tasks/url/operators/html/filter.py @@ -0,0 +1,13 @@
+from http import HTTPStatus
+
+from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO
+
+
+async def filter_just_urls(tdos: list[UrlHtmlTDO]) -> list[str]:
+    return [tdo.url_info.url for tdo
in tdos] + +async def filter_404_subset(tdos: list[UrlHtmlTDO]) -> list[UrlHtmlTDO]: + return [ + tdo for tdo in tdos + if tdo.url_response_info.status == HTTPStatus.NOT_FOUND + ] diff --git a/src/core/tasks/url/operators/url_404_probe/__init__.py b/src/core/tasks/url/operators/html/models/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_404_probe/__init__.py rename to src/core/tasks/url/operators/html/models/__init__.py diff --git a/src/core/tasks/url/operators/url_duplicate/__init__.py b/src/core/tasks/url/operators/html/models/subsets/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_duplicate/__init__.py rename to src/core/tasks/url/operators/html/models/subsets/__init__.py diff --git a/src/core/tasks/url/operators/html/models/subsets/error_404.py b/src/core/tasks/url/operators/html/models/subsets/error_404.py new file mode 100644 index 00000000..f526368c --- /dev/null +++ b/src/core/tasks/url/operators/html/models/subsets/error_404.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO + + +class ErrorSubsets(BaseModel): + is_404: list[UrlHtmlTDO] + not_404: list[UrlHtmlTDO] \ No newline at end of file diff --git a/src/core/tasks/url/operators/html/models/subsets/success_error.py b/src/core/tasks/url/operators/html/models/subsets/success_error.py new file mode 100644 index 00000000..75429a6e --- /dev/null +++ b/src/core/tasks/url/operators/html/models/subsets/success_error.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO + + +class SuccessErrorSubset(BaseModel): + success: list[UrlHtmlTDO] + error: list[UrlHtmlTDO] \ No newline at end of file diff --git a/src/core/tasks/url/operators/url_html/__init__.py b/src/core/tasks/url/operators/html/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/__init__.py rename to src/core/tasks/url/operators/html/queries/__init__.py diff --git a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py b/src/core/tasks/url/operators/html/queries/get.py similarity index 87% rename from src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py rename to src/core/tasks/url/operators/html/queries/get.py index ff7f7c10..d09f8bca 100644 --- a/src/core/tasks/url/operators/url_html/queries/get_pending_urls_without_html_data.py +++ b/src/core/tasks/url/operators/html/queries/get.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -9,7 +9,7 @@ class GetPendingURLsWithoutHTMLDataQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[URLInfo]: - statement = StatementComposer.pending_urls_without_html_data() + statement = StatementComposer.has_non_errored_urls_without_html_data() statement = statement.limit(100).order_by(URL.id) scalar_result = await session.scalars(statement) url_results: list[URL] = scalar_result.all() diff --git a/src/core/tasks/url/operators/url_html/queries/__init__.py b/src/core/tasks/url/operators/html/queries/insert/__init__.py similarity index 100% rename from 
src/core/tasks/url/operators/url_html/queries/__init__.py rename to src/core/tasks/url/operators/html/queries/insert/__init__.py diff --git a/src/core/tasks/url/operators/html/queries/insert/convert.py b/src/core/tasks/url/operators/html/queries/insert/convert.py new file mode 100644 index 00000000..9c9906d8 --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/insert/convert.py @@ -0,0 +1,73 @@ +from http import HTTPStatus + +from src.core.tasks.url.operators.html.content_info_getter import HTMLContentInfoGetter +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.instantiations.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.instantiations.url.scrape_info.pydantic import URLScrapeInfoInsertModel +from src.db.utils.compression import compress_html +from src.external.url_request.dtos.url_response import URLResponseInfo + + +def convert_to_compressed_html(tdos: list[UrlHtmlTDO]) -> list[URLCompressedHTMLPydantic]: + models = [] + for tdo in tdos: + if tdo.url_response_info.status != HTTPStatus.OK: + continue + model = URLCompressedHTMLPydantic( + url_id=tdo.url_info.id, + compressed_html=compress_html(tdo.url_response_info.html) + ) + models.append(model) + return models + + + +def _convert_to_html_content_info_getter(tdo: UrlHtmlTDO) -> HTMLContentInfoGetter: + return HTMLContentInfoGetter( + response_html_info=tdo.html_tag_info, + url_id=tdo.url_info.id + ) + +def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContentInfo]: + html_content_infos = [] + for tdo in tdos: + if tdo.url_response_info.status != HTTPStatus.OK: + continue + hcig = _convert_to_html_content_info_getter(tdo) + results = hcig.get_all_html_content() + html_content_infos.extend(results) + return html_content_infos + +def get_scrape_status(response_info: URLResponseInfo) -> ScrapeStatus: + if response_info.success: + return ScrapeStatus.SUCCESS + return ScrapeStatus.ERROR + +def convert_to_scrape_infos(tdos: list[UrlHtmlTDO]) -> list[URLScrapeInfoInsertModel]: + models = [] + for tdo in tdos: + model = URLScrapeInfoInsertModel( + url_id=tdo.url_info.id, + status=get_scrape_status(tdo.url_response_info) + ) + models.append(model) + return models + +def convert_to_url_errors( + tdos: list[UrlHtmlTDO], + task_id: int +) -> list[URLErrorPydanticInfo]: + models = [] + for tdo in tdos: + if tdo.url_response_info.success: + continue + model = URLErrorPydanticInfo( + url_id=tdo.url_info.id, + error=tdo.url_response_info.exception, + task_id=task_id + ) + models.append(model) + return models \ No newline at end of file diff --git a/src/core/tasks/url/operators/html/queries/insert/query.py b/src/core/tasks/url/operators/html/queries/insert/query.py new file mode 100644 index 00000000..e0bff2e6 --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/insert/query.py @@ -0,0 +1,30 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.html.queries.insert.convert import convert_to_compressed_html, \ + convert_to_html_content_info_list, convert_to_scrape_infos, convert_to_url_errors +from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class 
InsertURLHTMLInfoQueryBuilder(QueryBuilderBase): + + def __init__(self, tdos: list[UrlHtmlTDO], task_id: int): + super().__init__() + self.tdos = tdos + self.task_id = task_id + + async def run(self, session: AsyncSession) -> None: + compressed_html_models = convert_to_compressed_html(self.tdos) + url_html_content_list = convert_to_html_content_info_list(self.tdos) + scrape_info_list = convert_to_scrape_infos(self.tdos) + url_errors = convert_to_url_errors(self.tdos, task_id=self.task_id) + + for models in [ + compressed_html_models, + url_html_content_list, + scrape_info_list, + url_errors + ]: + await sh.bulk_insert(session, models=models) + + diff --git a/src/core/tasks/url/operators/url_html/scraper/README.md b/src/core/tasks/url/operators/html/scraper/README.md similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/README.md rename to src/core/tasks/url/operators/html/scraper/README.md diff --git a/src/core/tasks/url/operators/url_html/scraper/__init__.py b/src/core/tasks/url/operators/html/scraper/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/__init__.py rename to src/core/tasks/url/operators/html/scraper/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/README.md b/src/core/tasks/url/operators/html/scraper/parser/README.md similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/README.md rename to src/core/tasks/url/operators/html/scraper/parser/README.md diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/__init__.py b/src/core/tasks/url/operators/html/scraper/parser/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/__init__.py rename to src/core/tasks/url/operators/html/scraper/parser/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/constants.py b/src/core/tasks/url/operators/html/scraper/parser/constants.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/constants.py rename to src/core/tasks/url/operators/html/scraper/parser/constants.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/core.py b/src/core/tasks/url/operators/html/scraper/parser/core.py similarity index 89% rename from src/core/tasks/url/operators/url_html/scraper/parser/core.py rename to src/core/tasks/url/operators/html/scraper/parser/core.py index 737f03dd..a212b951 100644 --- a/src/core/tasks/url/operators/url_html/scraper/parser/core.py +++ b/src/core/tasks/url/operators/html/scraper/parser/core.py @@ -3,11 +3,11 @@ from bs4 import BeautifulSoup -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.core.tasks.url.operators.url_html.scraper.parser.enums import ParserTypeEnum -from src.core.tasks.url.operators.url_html.scraper.parser.constants import HEADER_TAGS -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache -from src.core.tasks.url.operators.url_html.scraper.parser.util import remove_excess_whitespace, add_https, remove_trailing_backslash, \ +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.enums import ParserTypeEnum +from src.core.tasks.url.operators.html.scraper.parser.constants import HEADER_TAGS +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache +from 
src.core.tasks.url.operators.html.scraper.parser.util import remove_excess_whitespace, add_https, remove_trailing_backslash, \ drop_hostname diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/dtos/__init__.py b/src/core/tasks/url/operators/html/scraper/parser/dtos/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/dtos/__init__.py rename to src/core/tasks/url/operators/html/scraper/parser/dtos/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/dtos/response_html.py b/src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/dtos/response_html.py rename to src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/enums.py b/src/core/tasks/url/operators/html/scraper/parser/enums.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/parser/enums.py rename to src/core/tasks/url/operators/html/scraper/parser/enums.py diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/mapping.py b/src/core/tasks/url/operators/html/scraper/parser/mapping.py similarity index 80% rename from src/core/tasks/url/operators/url_html/scraper/parser/mapping.py rename to src/core/tasks/url/operators/html/scraper/parser/mapping.py index 6b5f0b83..641af779 100644 --- a/src/core/tasks/url/operators/url_html/scraper/parser/mapping.py +++ b/src/core/tasks/url/operators/html/scraper/parser/mapping.py @@ -1,4 +1,4 @@ -from src.db.dtos.url.html_content import HTMLContentType +from src.db.models.instantiations.url.html.content.enums import HTMLContentType ENUM_TO_ATTRIBUTE_MAPPING = { HTMLContentType.TITLE: "title", diff --git a/src/core/tasks/url/operators/url_html/scraper/parser/util.py b/src/core/tasks/url/operators/html/scraper/parser/util.py similarity index 84% rename from src/core/tasks/url/operators/url_html/scraper/parser/util.py rename to src/core/tasks/url/operators/html/scraper/parser/util.py index 09453984..a4ea2d1b 100644 --- a/src/core/tasks/url/operators/url_html/scraper/parser/util.py +++ b/src/core/tasks/url/operators/html/scraper/parser/util.py @@ -1,8 +1,8 @@ from urllib.parse import urlparse from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo def convert_to_response_html_info(html_content_infos: list[URLHTMLContentInfo]): diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/__init__.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/__init__.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/constants.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/constants.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py 
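The renames in this group are mechanical: scraper modules drop the `url_` prefix (`operators.url_html.scraper.*` becomes `operators.html.scraper.*`), while the request interface leaves the scraper package entirely and now lives under `src.external.url_request`. A hedged before/after sketch of a typical caller, built only from module paths that appear elsewhere in this patch:

    # Old imports (removed throughout this patch):
    # from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser
    # from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface

    # New imports (added throughout this patch):
    from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser
    from src.external.url_request.core import URLRequestInterface

Because Python resolves imports at module load, any caller still using the old `operators.url_html` paths fails immediately at import time, so a repo-wide grep for `operators.url_html` is a cheap post-merge check.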
diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/core.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py similarity index 92% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/core.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/core.py index c30bc16e..284ad678 100644 --- a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/core.py +++ b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py @@ -5,8 +5,8 @@ from bs4 import BeautifulSoup from src.db.client.async_ import AsyncDatabaseClient -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.constants import REQUEST_HEADERS -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo +from src.core.tasks.url.operators.html.scraper.root_url_cache.constants import REQUEST_HEADERS +from src.core.tasks.url.operators.html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo DEBUG = False diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/__init__.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/__init__.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/__init__.py diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/response.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/response.py rename to src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py diff --git a/src/core/tasks/url/operators/html/tdo.py b/src/core/tasks/url/operators/html/tdo.py new file mode 100644 index 00000000..6395e363 --- /dev/null +++ b/src/core/tasks/url/operators/html/tdo.py @@ -0,0 +1,12 @@
+from pydantic import BaseModel
+
+from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
+from src.db.models.instantiations.url.core.pydantic.info import URLInfo
+from src.external.url_request.dtos.url_response import URLResponseInfo
+
+
+class UrlHtmlTDO(BaseModel):
+    url_info: URLInfo
+    url_response_info: URLResponseInfo | None = None
+    html_tag_info: ResponseHTMLInfo | None = None
+
diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/__init__.py b/src/core/tasks/url/operators/misc_metadata/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/__init__.py rename to src/core/tasks/url/operators/misc_metadata/__init__.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py b/src/core/tasks/url/operators/misc_metadata/core.py similarity index 96% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/core.py rename to src/core/tasks/url/operators/misc_metadata/core.py index 446c32c4..9921846b 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/core.py +++ b/src/core/tasks/url/operators/misc_metadata/core.py @@ -4,7 +4,7 @@ from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo
import URLMiscellaneousMetadataTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.subtasks.miscellaneous_metadata.auto_googler import AutoGooglerMiscMetadataSubtask from src.core.tasks.url.subtasks.miscellaneous_metadata.ckan import CKANMiscMetadataSubtask diff --git a/src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/__init__.py b/src/core/tasks/url/operators/misc_metadata/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/root_url_cache/dtos/__init__.py rename to src/core/tasks/url/operators/misc_metadata/queries/__init__.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py similarity index 89% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py rename to src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index e5add9ce..ed411bd6 100644 --- a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -2,8 +2,8 @@ from sqlalchemy.orm import selectinload from src.collectors.enums import CollectorType -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo -from src.db.dtos.url.html_content import HTMLContentType +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/has_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/misc_metadata/queries/has_pending_urls_missing_miscellaneous_data.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/queries/has_pending_urls_missing_miscellaneous_data.py rename to src/core/tasks/url/operators/misc_metadata/queries/has_pending_urls_missing_miscellaneous_data.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/tdo.py b/src/core/tasks/url/operators/misc_metadata/tdo.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/tdo.py rename to src/core/tasks/url/operators/misc_metadata/tdo.py diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/__init__.py b/src/core/tasks/url/operators/probe/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/__init__.py rename to src/core/tasks/url/operators/probe/__init__.py diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py new file mode 100644 index 00000000..98d4f8ab --- /dev/null +++ b/src/core/tasks/url/operators/probe/core.py @@ -0,0 +1,77 @@ +from typing import final +from typing_extensions import override + +from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from 
src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from src.external.url_request.core import URLRequestInterface +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.enums import TaskType + +@final +class URLProbeTaskOperator(URLTaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + url_request_interface: URLRequestInterface + ): + super().__init__(adb_client=adb_client) + self.url_request_interface = url_request_interface + + + @property + @override + def task_type(self) -> TaskType: + return TaskType.PROBE_URL + + @override + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.has_urls_without_probe() + + async def get_urls_without_probe(self) -> list[URLProbeTDO]: + url_mappings: list[URLMapping] = await self.adb_client.get_urls_without_probe() + return [URLProbeTDO(url_mapping=url_mapping) for url_mapping in url_mappings] + + @override + async def inner_task_logic(self) -> None: + tdos = await self.get_urls_without_probe() + await self.link_urls_to_task( + url_ids=[tdo.url_mapping.url_id for tdo in tdos] + ) + await self.probe_urls(tdos) + await self.update_database(tdos) + + async def probe_urls(self, tdos: list[URLProbeTDO]) -> None: + """Probe URLs and add responses to URLProbeTDO + + Modifies: + URLProbeTDO.response + """ + url_to_tdo: dict[str, URLProbeTDO] = { + tdo.url_mapping.url: tdo for tdo in tdos + } + responses = await self.url_request_interface.probe_urls( + urls=[tdo.url_mapping.url for tdo in tdos] + ) + # Re-associate the responses with the URL mappings + for response in responses: + tdo = url_to_tdo[response.url] + tdo.response = response + + async def update_database(self, tdos: list[URLProbeTDO]) -> None: + web_metadata_objects: list[URLWebMetadataPydantic] = [] + for tdo in tdos: + response = tdo.response + web_metadata_object = URLWebMetadataPydantic( + url_id=tdo.url_mapping.url_id, + accessed=response.status_code is not None, + status_code=response.status_code, + content_type=response.content_type, + error_message=response.error + ) + web_metadata_objects.append(web_metadata_object) + await self.adb_client.bulk_insert(web_metadata_objects) + + diff --git a/src/core/tasks/url/operators/url_miscellaneous_metadata/queries/__init__.py b/src/core/tasks/url/operators/probe/queries/__init__.py similarity index 100% rename from src/core/tasks/url/operators/url_miscellaneous_metadata/queries/__init__.py rename to src/core/tasks/url/operators/probe/queries/__init__.py diff --git a/src/core/tasks/url/operators/probe/queries/get_urls.py b/src/core/tasks/url/operators/probe/queries/get_urls.py new file mode 100644 index 00000000..9df9191f --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/get_urls.py @@ -0,0 +1,31 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import override, final + +from src.db.dtos.url.mapping import URLMapping +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.helpers.session import session_helper as sh +from src.db.queries.base.builder import QueryBuilderBase + + +@final +class GetURLsWithoutProbeQueryBuilder(QueryBuilderBase): + + @override + async def run(self, session: AsyncSession) -> list[URLMapping]: + query = ( + select( + URL.id.label("url_id"), + URL.url + ) + .outerjoin( + URLWebMetadata, + URL.id == 
URLWebMetadata.url_id +            ) +            .where( +                URLWebMetadata.id.is_(None) +            ) +        ) +        db_mappings = await sh.mappings(session, query=query) +        return [URLMapping(**mapping) for mapping in db_mappings] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/has_urls.py b/src/core/tasks/url/operators/probe/queries/has_urls.py new file mode 100644 index 00000000..1ae7835b --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/has_urls.py @@ -0,0 +1,27 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import override, final + +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.base.builder import QueryBuilderBase + +@final +class HasURLsWithoutProbeQueryBuilder(QueryBuilderBase): + +    @override +    async def run(self, session: AsyncSession) -> bool: +        query = ( +            select( +                URL.id +            ) +            .outerjoin( +                URLWebMetadata, +                URL.id == URLWebMetadata.url_id +            ) +            .where( +                URLWebMetadata.id.is_(None) +            ) +        ) +        return await sh.has_results(session, query=query) diff --git a/src/core/tasks/url/operators/probe/queries/insert.py b/src/core/tasks/url/operators/probe/queries/insert.py new file mode 100644 index 00000000..2b312e36 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert.py @@ -0,0 +1,15 @@ +from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import override, final + +from src.db.queries.base.builder import QueryBuilderBase + + +@final +class InsertURLMetadataInfoQueryBuilder(QueryBuilderBase): +    """Stub: URL web metadata inserts currently go through +    AsyncDatabaseClient.bulk_insert rather than this builder.""" + +    @override +    async def run(self, session: AsyncSession) -> None: +        # Not yet implemented; fail loudly if this builder is ever run. +        raise NotImplementedError diff --git a/src/core/tasks/url/operators/probe/tdo.py b/src/core/tasks/url/operators/probe/tdo.py new file mode 100644 index 00000000..8af513c1 --- /dev/null +++ b/src/core/tasks/url/operators/probe/tdo.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.external.url_request.probe.model import URLProbeResponse +from src.db.dtos.url.mapping import URLMapping + + +class URLProbeTDO(BaseModel): +    url_mapping: URLMapping +    response: URLProbeResponse | None = None diff --git a/api/main.py b/src/core/tasks/url/operators/probe_404/__init__.py similarity index 100% rename from api/main.py rename to src/core/tasks/url/operators/probe_404/__init__.py diff --git a/src/core/tasks/url/operators/url_404_probe/core.py b/src/core/tasks/url/operators/probe_404/core.py similarity index 92% rename from src/core/tasks/url/operators/url_404_probe/core.py rename to src/core/tasks/url/operators/probe_404/core.py index 7da96068..6600d17d 100644 --- a/src/core/tasks/url/operators/url_404_probe/core.py +++ b/src/core/tasks/url/operators/probe_404/core.py @@ -2,10 +2,10 @@ from pydantic import BaseModel -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO +from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.core.tasks.url.operators.url_404_probe.tdo import URL404ProbeTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/url_404_probe/tdo.py b/src/core/tasks/url/operators/probe_404/tdo.py similarity index 100% rename from 
src/core/tasks/url/operators/url_404_probe/tdo.py rename to src/core/tasks/url/operators/probe_404/tdo.py diff --git a/src/core/tasks/url/operators/submit_approved/__init__.py b/src/core/tasks/url/operators/submit_approved/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/submit_approved_url/core.py b/src/core/tasks/url/operators/submit_approved/core.py similarity index 96% rename from src/core/tasks/url/operators/submit_approved_url/core.py rename to src/core/tasks/url/operators/submit_approved/core.py index d2e20c3a..e6b1be9f 100644 --- a/src/core/tasks/url/operators/submit_approved_url/core.py +++ b/src/core/tasks/url/operators/submit_approved/core.py @@ -1,7 +1,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.external.pdap.client import PDAPClient diff --git a/src/core/tasks/url/operators/submit_approved/queries/__init__.py b/src/core/tasks/url/operators/submit_approved/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py similarity index 96% rename from src/core/tasks/url/operators/submit_approved_url/queries/get.py rename to src/core/tasks/url/operators/submit_approved/queries/get.py index ea40ce79..db128326 100644 --- a/src/core/tasks/url/operators/submit_approved_url/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import selectinload from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/queries/has_validated.py rename to src/core/tasks/url/operators/submit_approved/queries/has_validated.py diff --git a/src/core/tasks/url/operators/submit_approved_url/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py similarity index 93% rename from src/core/tasks/url/operators/submit_approved_url/queries/mark_submitted.py rename to src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py index 9c68ec21..347fba11 100644 --- a/src/core/tasks/url/operators/submit_approved_url/queries/mark_submitted.py +++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from 
src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/submit_approved_url/tdo.py b/src/core/tasks/url/operators/submit_approved/tdo.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved_url/tdo.py rename to src/core/tasks/url/operators/submit_approved/tdo.py diff --git a/src/core/tasks/url/operators/url_html/core.py b/src/core/tasks/url/operators/url_html/core.py deleted file mode 100644 index 39a09546..00000000 --- a/src/core/tasks/url/operators/url_html/core.py +++ /dev/null @@ -1,149 +0,0 @@ -from http import HTTPStatus - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo -from src.db.dtos.url.raw_html import RawHTMLInfo -from src.db.enums import TaskType -from src.core.tasks.url.operators.url_html.tdo import UrlHtmlTDO -from src.core.tasks.url.operators.url_html.content_info_getter import HTMLContentInfoGetter -from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface - - -class URLHTMLTaskOperator(URLTaskOperatorBase): - - def __init__( - self, - url_request_interface: URLRequestInterface, - adb_client: AsyncDatabaseClient, - html_parser: HTMLResponseParser - ): - super().__init__(adb_client) - self.url_request_interface = url_request_interface - self.html_parser = html_parser - - @property - def task_type(self): - return TaskType.HTML - - async def meets_task_prerequisites(self): - return await self.adb_client.has_pending_urls_without_html_data() - - async def inner_task_logic(self): - tdos = await self.get_pending_urls_without_html_data() - url_ids = [task_info.url_info.id for task_info in tdos] - await self.link_urls_to_task(url_ids=url_ids) - await self.get_raw_html_data_for_urls(tdos) - success_subset, error_subset = await self.separate_success_and_error_subsets(tdos) - non_404_error_subset, is_404_error_subset = await self.separate_error_and_404_subsets(error_subset) - await self.process_html_data(success_subset) - await self.update_database(is_404_error_subset, non_404_error_subset, success_subset) - - async def update_database( - self, - is_404_error_subset: list[UrlHtmlTDO], - non_404_error_subset: list[UrlHtmlTDO], - success_subset: list[UrlHtmlTDO] - ): - await self.update_errors_in_database(non_404_error_subset) - await self.update_404s_in_database(is_404_error_subset) - await self.update_html_data_in_database(success_subset) - - async def get_just_urls(self, tdos: list[UrlHtmlTDO]): - return [task_info.url_info.url for task_info in tdos] - - async def get_pending_urls_without_html_data(self): - pending_urls: list[URLInfo] = await self.adb_client.get_pending_urls_without_html_data() - tdos = [ - UrlHtmlTDO( - url_info=url_info, - ) for url_info in pending_urls - ] - return tdos - - async def get_raw_html_data_for_urls(self, tdos: list[UrlHtmlTDO]): - just_urls = await self.get_just_urls(tdos) - url_response_infos = await self.url_request_interface.make_requests_with_html(just_urls) - for tdto, url_response_info in zip(tdos, url_response_infos): - tdto.url_response_info = url_response_info - - async def separate_success_and_error_subsets( - self, - tdos: list[UrlHtmlTDO] - ) 
-> tuple[ - list[UrlHtmlTDO], # Successful - list[UrlHtmlTDO] # Error - ]: - errored_tdos = [] - successful_tdos = [] - for tdto in tdos: - if not tdto.url_response_info.success: - errored_tdos.append(tdto) - else: - successful_tdos.append(tdto) - return successful_tdos, errored_tdos - - async def separate_error_and_404_subsets( - self, - tdos: list[UrlHtmlTDO] - ) -> tuple[ - list[UrlHtmlTDO], # Error - list[UrlHtmlTDO] # 404 - ]: - tdos_error = [] - tdos_404 = [] - for tdo in tdos: - if tdo.url_response_info.status is None: - tdos_error.append(tdo) - continue - if tdo.url_response_info.status == HTTPStatus.NOT_FOUND: - tdos_404.append(tdo) - else: - tdos_error.append(tdo) - return tdos_error, tdos_404 - - async def update_404s_in_database(self, tdos_404: list[UrlHtmlTDO]): - url_ids = [tdo.url_info.id for tdo in tdos_404] - await self.adb_client.mark_all_as_404(url_ids) - - async def update_errors_in_database(self, error_tdos: list[UrlHtmlTDO]): - error_infos = [] - for error_tdo in error_tdos: - error_info = URLErrorPydanticInfo( - task_id=self.task_id, - url_id=error_tdo.url_info.id, - error=str(error_tdo.url_response_info.exception), - ) - error_infos.append(error_info) - await self.adb_client.add_url_error_infos(error_infos) - - async def process_html_data(self, tdos: list[UrlHtmlTDO]): - for tdto in tdos: - - html_tag_info = await self.html_parser.parse( - url=tdto.url_info.url, - html_content=tdto.url_response_info.html, - content_type=tdto.url_response_info.content_type - ) - tdto.html_tag_info = html_tag_info - - async def update_html_data_in_database(self, tdos: list[UrlHtmlTDO]): - html_content_infos = [] - raw_html_data = [] - for tdto in tdos: - hcig = HTMLContentInfoGetter( - response_html_info=tdto.html_tag_info, - url_id=tdto.url_info.id - ) - rhi = RawHTMLInfo( - url_id=tdto.url_info.id, - html=tdto.url_response_info.html - ) - raw_html_data.append(rhi) - results = hcig.get_all_html_content() - html_content_infos.extend(results) - - await self.adb_client.add_html_content_infos(html_content_infos) - await self.adb_client.add_raw_html(raw_html_data) diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py b/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py deleted file mode 100644 index f45780cb..00000000 --- a/src/core/tasks/url/operators/url_html/scraper/request_interface/core.py +++ /dev/null @@ -1,80 +0,0 @@ -from http import HTTPStatus -from typing import Optional - -from aiohttp import ClientSession, ClientResponseError -from playwright.async_api import async_playwright -from tqdm.asyncio import tqdm - -from src.core.tasks.url.operators.url_html.scraper.request_interface.constants import HTML_CONTENT_TYPE -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.request_resources import RequestResources -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo - - -class URLRequestInterface: - - async def get_response(self, session: ClientSession, url: str) -> URLResponseInfo: - try: - async with session.get(url, timeout=20) as response: - response.raise_for_status() - text = await response.text() - return URLResponseInfo( - success=True, - html=text, - content_type=response.headers.get("content-type"), - status=HTTPStatus(response.status) - ) - except ClientResponseError as e: - return URLResponseInfo(success=False, status=HTTPStatus(e.status), exception=str(e)) - except Exception as e: - print(f"An error occurred while fetching {url}: {e}") - 
return URLResponseInfo(success=False, exception=str(e)) - - async def fetch_and_render(self, rr: RequestResources, url: str) -> Optional[URLResponseInfo]: - simple_response = await self.get_response(rr.session, url) - if not simple_response.success: - return simple_response - - if simple_response.content_type != HTML_CONTENT_TYPE: - return simple_response - - return await self.get_dynamic_html_content(rr, url) - - async def get_dynamic_html_content(self, rr, url): - # For HTML responses, attempt to load the page to check for dynamic html content - async with rr.semaphore: - page = await rr.browser.new_page() - try: - await page.goto(url) - await page.wait_for_load_state("networkidle") - html_content = await page.content() - return URLResponseInfo( - success=True, - html=html_content, - content_type=HTML_CONTENT_TYPE, - status=HTTPStatus.OK - ) - except Exception as e: - return URLResponseInfo(success=False, exception=str(e)) - finally: - await page.close() - - async def fetch_urls(self, urls: list[str]) -> list[URLResponseInfo]: - async with ClientSession() as session: - async with async_playwright() as playwright: - browser = await playwright.chromium.launch(headless=True) - request_resources = RequestResources(session=session, browser=browser) - tasks = [self.fetch_and_render(request_resources, url) for url in urls] - results = await tqdm.gather(*tasks) - return results - - async def make_requests_with_html( - self, - urls: list[str], - ) -> list[URLResponseInfo]: - return await self.fetch_urls(urls) - - async def make_simple_requests(self, urls: list[str]) -> list[URLResponseInfo]: - async with ClientSession() as session: - tasks = [self.get_response(session, url) for url in urls] - results = await tqdm.gather(*tasks) - return results diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/url_response.py b/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/url_response.py deleted file mode 100644 index 8e17c078..00000000 --- a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/url_response.py +++ /dev/null @@ -1,12 +0,0 @@ -from http import HTTPStatus -from typing import Optional - -from pydantic import BaseModel - - -class URLResponseInfo(BaseModel): - success: bool - status: Optional[HTTPStatus] = None - html: Optional[str] = None - content_type: Optional[str] = None - exception: Optional[str] = None diff --git a/src/core/tasks/url/operators/url_html/tdo.py b/src/core/tasks/url/operators/url_html/tdo.py deleted file mode 100644 index 326412a3..00000000 --- a/src/core/tasks/url/operators/url_html/tdo.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo - - -class UrlHtmlTDO(BaseModel): - url_info: URLInfo - url_response_info: Optional[URLResponseInfo] = None - html_tag_info: Optional[ResponseHTMLInfo] = None - diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py index 0f183f78..e060d0d3 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/auto_googler.py @@ -1,4 +1,4 @@ -from 
src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.subtasks.miscellaneous_metadata.base import \ MiscellaneousMetadataSubtaskBase diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py index 7b38504d..3ca7357b 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/base.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO class MiscellaneousMetadataSubtaskBase(ABC): diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py index 90512e2b..ef60b48c 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/ckan.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.subtasks.miscellaneous_metadata.base import \ MiscellaneousMetadataSubtaskBase diff --git a/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py b/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py index bb3eaadf..18a749b7 100644 --- a/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py +++ b/src/core/tasks/url/subtasks/miscellaneous_metadata/muckrock.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.tasks.url.subtasks.miscellaneous_metadata.base import \ MiscellaneousMetadataSubtaskBase diff --git a/src/db/client/async_.py b/src/db/client/async_.py index d4368dd7..25b40852 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -77,19 +77,21 @@ HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.queries.get import GetValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.queries.has_validated import HasValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.core.tasks.url.operators.url_404_probe.tdo import URL404ProbeTDO -from src.core.tasks.url.operators.url_duplicate.tdo import URLDuplicateTDO -from src.core.tasks.url.operators.url_html.queries.get_pending_urls_without_html_data import \ +from src.core.tasks.url.operators.probe.queries.get_urls import GetURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe.queries.has_urls import HasURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO +from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder +from 
src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder +from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO +from src.core.tasks.url.operators.html.queries.get import \ GetPendingURLsWithoutHTMLDataQueryBuilder -from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.get_pending_urls_missing_miscellaneous_data import \ +from src.core.tasks.url.operators.misc_metadata.queries.get_pending_urls_missing_miscellaneous_data import \ GetPendingURLsMissingMiscellaneousDataQueryBuilder -from src.core.tasks.url.operators.url_miscellaneous_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ +from src.core.tasks.url.operators.misc_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ HasPendingURsMissingMiscellaneousDataQueryBuilder -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager @@ -118,13 +120,13 @@ from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion @@ -134,7 +136,8 @@ from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion -from src.db.models.templates import Base +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.templates_.base import Base from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder @@ -142,11 +145,13 @@ GetMetricsURLSAggregatedPendingQueryBuilder 
from src.db.statement_composer import StatementComposer from src.db.templates.markers.bulk.delete import BulkDeletableModel +from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo + class AsyncDatabaseClient: def __init__(self, db_url: Optional[str] = None): if db_url is None: @@ -184,7 +189,6 @@ async def wrapper(self, *args, **kwargs): return wrapper - @session_manager async def execute(self, session: AsyncSession, statement): await session.execute(statement) @@ -236,6 +240,15 @@ async def bulk_delete( ): return await sh.bulk_delete(session, models) + @session_manager + async def bulk_insert( + self, + session: AsyncSession, + models: list[BulkInsertableModel], + return_ids: bool = False + ) -> list[int] | None: + return await sh.bulk_insert(session, models=models, return_ids=return_ids) + @session_manager async def scalar(self, session: AsyncSession, statement): """Fetch the first column of the first row.""" @@ -480,8 +493,8 @@ async def add_html_content_infos(self, session: AsyncSession, html_content_infos await self._add_models(session, URLHTMLContent, html_content_infos) @session_manager - async def has_pending_urls_without_html_data(self, session: AsyncSession) -> bool: - statement = self.statement_composer.pending_urls_without_html_data() + async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool: + statement = self.statement_composer.has_non_errored_urls_without_html_data() statement = statement.limit(1) scalar_result = await session.scalars(statement) return bool(scalar_result.first()) @@ -522,7 +535,7 @@ async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URL ) session.add(metadata_object) - async def get_pending_urls_without_html_data(self) -> list[URLInfo]: + async def get_non_errored_urls_without_html_data(self) -> list[URLInfo]: return await self.run_query_builder(GetPendingURLsWithoutHTMLDataQueryBuilder()) async def get_urls_with_html_data_and_without_models( @@ -554,7 +567,6 @@ async def get_urls_with_html_data_and_without_auto_record_type_suggestion( model=AutoRecordTypeSuggestion ) - async def has_urls_with_html_data_and_without_models( self, session: AsyncSession, @@ -596,7 +608,6 @@ async def get_all( """Get all records of a model. 
Used primarily in testing.""" return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) - @session_manager async def load_root_url_cache(self, session: AsyncSession) -> dict[str, str]: statement = select(RootURL) @@ -620,7 +631,6 @@ async def get_urls( page=page, errors=errors )) - @session_manager async def initiate_task( self, @@ -732,7 +742,6 @@ async def get_urls_without_agency_suggestions( """Retrieve URLs without confirmed or suggested agencies.""" return await self.run_query_builder(GetPendingURLsWithoutAgencySuggestionsQueryBuilder()) - async def get_next_url_agency_for_annotation( self, user_id: int, @@ -743,7 +752,6 @@ async def get_next_url_agency_for_annotation( batch_id=batch_id )) - @session_manager async def upsert_new_agencies( self, @@ -765,7 +773,6 @@ async def upsert_new_agencies( agency.locality = suggestion.locality session.add(agency) - @session_manager async def add_confirmed_agency_url_links( self, @@ -865,7 +872,6 @@ async def reject_url( rejection_reason=rejection_reason )) - @session_manager async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary]: """Retrieve a batch by ID.""" @@ -886,7 +892,11 @@ async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> list[URLInfo] )) @session_manager - async def insert_url(self, session: AsyncSession, url_info: URLInfo) -> int: + async def insert_url( + self, + session: AsyncSession, + url_info: URLInfo + ) -> int: """Insert a new URL into the database.""" url_entry = URL( url=url_info.url, @@ -905,21 +915,33 @@ async def insert_url(self, session: AsyncSession, url_info: URLInfo) -> int: return url_entry.id @session_manager - async def get_url_info_by_url(self, session: AsyncSession, url: str) -> Optional[URLInfo]: + async def get_url_info_by_url( + self, + session: AsyncSession, + url: str + ) -> URLInfo | None: query = Select(URL).where(URL.url == url) raw_result = await session.execute(query) url = raw_result.scalars().first() return URLInfo(**url.__dict__) @session_manager - async def get_url_info_by_id(self, session: AsyncSession, url_id: int) -> Optional[URLInfo]: + async def get_url_info_by_id( + self, + session: AsyncSession, + url_id: int + ) -> URLInfo | None: query = Select(URL).where(URL.id == url_id) raw_result = await session.execute(query) url = raw_result.scalars().first() return URLInfo(**url.__dict__) @session_manager - async def insert_logs(self, session, log_infos: List[LogInfo]): + async def insert_logs( + self, + session: AsyncSession, + log_infos: list[LogInfo] + ) -> None: for log_info in log_infos: log = Log(log=log_info.log, batch_id=log_info.batch_id) if log_info.created_at is not None: @@ -927,7 +949,11 @@ async def insert_logs(self, session, log_infos: List[LogInfo]): session.add(log) @session_manager - async def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsertInfo]): + async def insert_duplicates( + self, + session: AsyncSession, + duplicate_infos: list[DuplicateInsertInfo] + ) -> None: for duplicate_info in duplicate_infos: duplicate = Duplicate( batch_id=duplicate_info.duplicate_batch_id, @@ -936,7 +962,11 @@ async def insert_duplicates(self, session, duplicate_infos: list[DuplicateInsert session.add(duplicate) @session_manager - async def insert_batch(self, session: AsyncSession, batch_info: BatchInfo) -> int: + async def insert_batch( + self, + session: AsyncSession, + batch_info: BatchInfo + ) -> int: """Insert a new batch into the database and return its ID.""" batch = Batch( 
strategy=batch_info.strategy, @@ -956,7 +986,11 @@ async def insert_batch(self, session: AsyncSession, batch_info: BatchInfo) -> in await session.flush() return batch.id - async def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo: + async def insert_urls( + self, + url_infos: list[URLInfo], + batch_id: int + ) -> InsertURLsInfo: url_mappings = [] duplicates = [] for url_info in url_infos: @@ -984,14 +1018,14 @@ async def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertUR @session_manager async def update_batch_post_collection( self, - session, + session: AsyncSession, batch_id: int, total_url_count: int, original_url_count: int, duplicate_url_count: int, batch_status: BatchStatus, compute_time: float = None, - ): + ) -> None: query = Select(Batch).where(Batch.id == batch_id) result = await session.execute(query) @@ -1057,7 +1091,7 @@ async def delete_old_logs(self): async def get_next_url_for_all_annotations( self, batch_id: int | None = None - ) -> GetNextURLForAllAnnotationResponse: + ) -> GetNextURLForAllAnnotationResponse: return await self.run_query_builder(GetNextURLForAllAnnotationQueryBuilder(batch_id)) @session_manager @@ -1106,7 +1140,6 @@ async def upload_manual_batch( dto=dto )) - @session_manager async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: query = select(URL).where(URL.url == url) @@ -1127,7 +1160,6 @@ async def get_batches_aggregated_metrics(self) -> GetMetricsBatchesAggregatedRes GetBatchesAggregatedMetricsQueryBuilder() ) - async def get_batches_breakdown_metrics( self, page: int @@ -1414,6 +1446,8 @@ async def mark_all_as_duplicates(self, url_ids: List[int]): async def mark_all_as_404(self, url_ids: List[int]): query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.NOT_FOUND.value) await self.execute(query) + query = update(URLWebMetadata).where(URLWebMetadata.url_id.in_(url_ids)).values(status_code=404) + await self.execute(query) async def mark_all_as_recently_probed_for_404( self, @@ -1571,3 +1605,13 @@ async def check_valid_urls_updated(self) -> bool: async def get_current_database_time(self) -> datetime: return await self.scalar(select(func.now())) + + async def has_urls_without_probe(self) -> bool: + return await self.run_query_builder( + HasURLsWithoutProbeQueryBuilder() + ) + + async def get_urls_without_probe(self) -> list[URLMapping]: + return await self.run_query_builder( + GetURLsWithoutProbeQueryBuilder() + ) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 866feb25..613c335b 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -11,16 +11,16 @@ from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.instantiations.log.pydantic.info import LogInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.templates import Base +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.templates_.base import Base from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate from src.db.models.instantiations.log.sqlalchemy import Log from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch 
-from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index ed2d361c..869b8978 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -5,15 +5,16 @@ from src.api.endpoints.review.next.dto import FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, \ FinalReviewAnnotationAgencyAutoInfo, FinalReviewAnnotationAgencyInfo from src.core.enums import RecordType, SuggestionType -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.core.tasks.url.operators.url_html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING -from src.db.dtos.url.html_content import HTMLContentType, URLHTMLContentInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType from src.db.dtos.url.with_html import URLWithHTML from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion diff --git a/src/db/dtos/url/html_content.py b/src/db/dtos/url/html_content.py index f8b24eb0..1d3d67bf 100644 --- a/src/db/dtos/url/html_content.py +++ b/src/db/dtos/url/html_content.py @@ -1,21 +1,15 @@ -from enum import Enum -from typing import Optional +from src.db.models.instantiations.url.html.content.enums import HTMLContentType +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel -from pydantic import BaseModel - -class HTMLContentType(Enum): - TITLE = "Title" - DESCRIPTION = "Description" - H1 = "H1" - H2 = "H2" - H3 = "H3" - H4 = "H4" - H5 = "H5" - H6 = "H6" - DIV = "Div" - -class URLHTMLContentInfo(BaseModel): - url_id: Optional[int] = None +class URLHTMLContentInfo(BulkInsertableModel): + url_id: int | None = None content_type: HTMLContentType - content: str | list[str] \ No newline at end of file + content: str | list[str] + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URLHTMLContent \ No newline at end of file diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping.py index 38efbce4..18fc5be2 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping.py @@ -2,5 +2,6 @@ class URLMapping(BaseModel): + """Mapping between url and url_id.""" url: str url_id: int diff --git 
a/src/db/enums.py b/src/db/enums.py index 6c1d1496..c8ed9840 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -44,6 +44,7 @@ class TaskType(PyEnum): SYNC_AGENCIES = "Sync Agencies" SYNC_DATA_SOURCES = "Sync Data Sources" PUSH_TO_HUGGINGFACE = "Push to Hugging Face" + PROBE_URL = "URL Probe" class ChangeLogOperationType(PyEnum): INSERT = "INSERT" diff --git a/src/db/helpers/session/parser.py b/src/db/helpers/session/parser.py index bc822022..b580dcd1 100644 --- a/src/db/helpers/session/parser.py +++ b/src/db/helpers/session/parser.py @@ -1,5 +1,5 @@ from src.db.helpers.session.types import BulkActionType -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.templates.protocols.sa_correlated.core import SQLAlchemyCorrelatedProtocol from src.db.templates.protocols.sa_correlated.with_id import SQLAlchemyCorrelatedWithIDProtocol from src.db.utils.validate import validate_all_models_of_same_type diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 2b3776c1..a616664f 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -11,7 +11,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.helpers.session.parser import BulkActionParser -from src.db.models.templates import Base, StandardBase +from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.update import BulkUpdatableModel @@ -43,6 +44,10 @@ async def mappings(session: AsyncSession, query: sa.Select) -> Sequence[sa.RowMa raw_result = await session.execute(query) return raw_result.mappings().all() +async def has_results(session: AsyncSession, query: sa.Select) -> bool: + raw_result = await session.execute(query) + return raw_result.first() is not None + async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], @@ -88,7 +93,7 @@ async def add( async def add_all( session: AsyncSession, - models: list[StandardBase], + models: list[WithIDBase], return_ids: bool = False ) -> list[int] | None: session.add_all(models) diff --git a/src/db/models/instantiations/agency/pydantic/upsert.py b/src/db/models/instantiations/agency/pydantic/upsert.py index 9a869e84..1deeb6b5 100644 --- a/src/db/models/instantiations/agency/pydantic/upsert.py +++ b/src/db/models/instantiations/agency/pydantic/upsert.py @@ -1,7 +1,7 @@ from datetime import datetime from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.upsert import BulkUpsertableModel diff --git a/src/db/models/instantiations/agency/sqlalchemy.py b/src/db/models/instantiations/agency/sqlalchemy.py index 2ce3676f..8310eeac 100644 --- a/src/db/models/instantiations/agency/sqlalchemy.py +++ b/src/db/models/instantiations/agency/sqlalchemy.py @@ -6,13 +6,13 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import Base, StandardBase +from src.db.models.templates_.with_id import WithIDBase class Agency( CreatedAtMixin, # When agency was added to database UpdatedAtMixin, # When agency was last updated in database - StandardBase + WithIDBase ): __tablename__ = "agencies" diff --git 
a/src/db/models/instantiations/backlog_snapshot.py b/src/db/models/instantiations/backlog_snapshot.py index 89645160..6b0982cd 100644 --- a/src/db/models/instantiations/backlog_snapshot.py +++ b/src/db/models/instantiations/backlog_snapshot.py @@ -1,10 +1,10 @@ from sqlalchemy import Column, Integer from src.db.models.mixins import CreatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class BacklogSnapshot(CreatedAtMixin, StandardBase): +class BacklogSnapshot(CreatedAtMixin, WithIDBase): __tablename__ = "backlog_snapshot" count_pending_total = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/batch/sqlalchemy.py b/src/db/models/instantiations/batch/sqlalchemy.py index b001dbac..0e6aa611 100644 --- a/src/db/models/instantiations/batch/sqlalchemy.py +++ b/src/db/models/instantiations/batch/sqlalchemy.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.helpers import CURRENT_TIME_SERVER_DEFAULT -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import batch_status_enum -class Batch(StandardBase): +class Batch(WithIDBase): __tablename__ = 'batches' strategy = Column( diff --git a/src/db/models/instantiations/change_log.py b/src/db/models/instantiations/change_log.py index 975958ab..0cb74659 100644 --- a/src/db/models/instantiations/change_log.py +++ b/src/db/models/instantiations/change_log.py @@ -5,10 +5,10 @@ from src.db.enums import ChangeLogOperationType from src.db.models.mixins import CreatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class ChangeLog(CreatedAtMixin, StandardBase): +class ChangeLog(CreatedAtMixin, WithIDBase): __tablename__ = "change_log" diff --git a/src/db/models/instantiations/duplicate/sqlalchemy.py b/src/db/models/instantiations/duplicate/sqlalchemy.py index 67df3af5..03c492e3 100644 --- a/src/db/models/instantiations/duplicate/sqlalchemy.py +++ b/src/db/models/instantiations/duplicate/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class Duplicate(BatchDependentMixin, StandardBase): +class Duplicate(BatchDependentMixin, WithIDBase): """ Identifies duplicates which occur within a batch """ diff --git a/src/db/models/instantiations/link/batch_url.py b/src/db/models/instantiations/link/batch_url.py index d86b0703..8fb8f42e 100644 --- a/src/db/models/instantiations/link/batch_url.py +++ b/src/db/models/instantiations/link/batch_url.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin, BatchDependentMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase class LinkBatchURL( @@ -9,7 +9,7 @@ class LinkBatchURL( CreatedAtMixin, URLDependentMixin, BatchDependentMixin, - StandardBase + WithIDBase ): __tablename__ = "link_batch_urls" diff --git a/src/db/models/instantiations/link/task_url.py b/src/db/models/instantiations/link/task_url.py index 02ef02c3..2535d317 100644 --- a/src/db/models/instantiations/link/task_url.py +++ b/src/db/models/instantiations/link/task_url.py @@ -1,6 +1,6 @@ from sqlalchemy import UniqueConstraint, Column, Integer, ForeignKey -from 
src.db.models.templates import Base +from src.db.models.templates_.base import Base class LinkTaskURL(Base): diff --git a/src/db/models/instantiations/link/url_agency/sqlalchemy.py b/src/db/models/instantiations/link/url_agency/sqlalchemy.py index 28e42924..f8d72065 100644 --- a/src/db/models/instantiations/link/url_agency/sqlalchemy.py +++ b/src/db/models/instantiations/link/url_agency/sqlalchemy.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class LinkURLAgency(URLDependentMixin, StandardBase): +class LinkURLAgency(URLDependentMixin, WithIDBase): __tablename__ = "link_urls_agencies" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/instantiations/log/sqlalchemy.py b/src/db/models/instantiations/log/sqlalchemy.py index 769391cf..60f17875 100644 --- a/src/db/models/instantiations/log/sqlalchemy.py +++ b/src/db/models/instantiations/log/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, BatchDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class Log(CreatedAtMixin, BatchDependentMixin, StandardBase): +class Log(CreatedAtMixin, BatchDependentMixin, WithIDBase): __tablename__ = 'logs' log = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/missing.py b/src/db/models/instantiations/missing.py index 05665eba..6ad868df 100644 --- a/src/db/models/instantiations/missing.py +++ b/src/db/models/instantiations/missing.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import BatchDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class Missing(BatchDependentMixin, StandardBase): +class Missing(BatchDependentMixin, WithIDBase): __tablename__ = 'missing' place_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/root_url_cache.py b/src/db/models/instantiations/root_url_cache.py index 4ebadd50..f79e4b5c 100644 --- a/src/db/models/instantiations/root_url_cache.py +++ b/src/db/models/instantiations/root_url_cache.py @@ -1,10 +1,10 @@ from sqlalchemy import UniqueConstraint, Column, String from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class RootURL(UpdatedAtMixin, StandardBase): +class RootURL(UpdatedAtMixin, WithIDBase): __tablename__ = 'root_url_cache' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/state/huggingface.py b/src/db/models/instantiations/state/huggingface.py index 58e54cdc..d858dc0a 100644 --- a/src/db/models/instantiations/state/huggingface.py +++ b/src/db/models/instantiations/state/huggingface.py @@ -1,6 +1,6 @@ from sqlalchemy import Column, Integer, DateTime -from src.db.models.templates import Base +from src.db.models.templates_.base import Base class HuggingFaceUploadState(Base): diff --git a/src/db/models/instantiations/state/sync/agencies.py b/src/db/models/instantiations/state/sync/agencies.py index 207a2936..7ee1babe 100644 --- a/src/db/models/instantiations/state/sync/agencies.py +++ b/src/db/models/instantiations/state/sync/agencies.py @@ -4,7 +4,7 @@ from sqlalchemy import DateTime, Date, Integer, Column 
-from src.db.models.templates import Base +from src.db.models.templates_.base import Base class AgenciesSyncState(Base): diff --git a/src/db/models/instantiations/state/sync/data_sources.py b/src/db/models/instantiations/state/sync/data_sources.py index cf173860..333d0945 100644 --- a/src/db/models/instantiations/state/sync/data_sources.py +++ b/src/db/models/instantiations/state/sync/data_sources.py @@ -1,6 +1,6 @@ from sqlalchemy import Integer, Column, DateTime, Date -from src.db.models.templates import Base +from src.db.models.templates_.base import Base class DataSourcesSyncState(Base): diff --git a/src/db/models/instantiations/task/core.py b/src/db/models/instantiations/task/core.py index 514301c8..291a5d0a 100644 --- a/src/db/models/instantiations/task/core.py +++ b/src/db/models/instantiations/task/core.py @@ -3,11 +3,11 @@ from src.db.enums import PGEnum, TaskType from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import batch_status_enum -class Task(UpdatedAtMixin, StandardBase): +class Task(UpdatedAtMixin, WithIDBase): __tablename__ = 'tasks' task_type = Column( diff --git a/src/db/models/instantiations/task/error.py b/src/db/models/instantiations/task/error.py index 03014904..c5a25e78 100644 --- a/src/db/models/instantiations/task/error.py +++ b/src/db/models/instantiations/task/error.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, StandardBase): +class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): __tablename__ = 'task_errors' error = Column(Text, nullable=False) diff --git a/src/db/models/instantiations/url/checked_for_duplicate.py b/src/db/models/instantiations/url/checked_for_duplicate.py index 9443d0ac..bb7cf666 100644 --- a/src/db/models/instantiations/url/checked_for_duplicate.py +++ b/src/db/models/instantiations/url/checked_for_duplicate.py @@ -1,10 +1,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, StandardBase): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = 'url_checked_for_duplicate' # Relationships diff --git a/src/db/models/instantiations/url/core/pydantic.py b/src/db/models/instantiations/url/core/pydantic.py deleted file mode 100644 index e409c32c..00000000 --- a/src/db/models/instantiations/url/core/pydantic.py +++ /dev/null @@ -1,17 +0,0 @@ -import datetime -from typing import Optional - -from pydantic import BaseModel - -from src.collectors.enums import URLStatus - - -class URLInfo(BaseModel): - id: Optional[int] = None - batch_id: Optional[int] = None - url: str - collector_metadata: Optional[dict] = None - outcome: URLStatus = URLStatus.PENDING - updated_at: Optional[datetime.datetime] = None - created_at: Optional[datetime.datetime] = None - name: Optional[str] = None diff --git a/src/db/models/instantiations/url/core/pydantic/__init__.py b/src/db/models/instantiations/url/core/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/db/models/instantiations/url/core/pydantic/info.py b/src/db/models/instantiations/url/core/pydantic/info.py new file mode 100644 index 00000000..6099db29 --- /dev/null +++ b/src/db/models/instantiations/url/core/pydantic/info.py @@ -0,0 +1,17 @@ +import datetime +from typing import Optional + +from pydantic import BaseModel + +from src.collectors.enums import URLStatus + + +class URLInfo(BaseModel): +    id: int | None = None +    batch_id: int | None = None +    url: str +    collector_metadata: dict | None = None +    outcome: URLStatus = URLStatus.PENDING +    updated_at: datetime.datetime | None = None +    created_at: datetime.datetime | None = None +    name: str | None = None diff --git a/src/db/models/instantiations/url/core/pydantic/insert.py b/src/db/models/instantiations/url/core/pydantic/insert.py new file mode 100644 index 00000000..e384416e --- /dev/null +++ b/src/db/models/instantiations/url/core/pydantic/insert.py @@ -0,0 +1,19 @@ +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLInsertModel(BulkInsertableModel): + +    @classmethod +    def sa_model(cls) -> type[Base]: +        """Defines the SQLAlchemy model.""" +        return URL + +    url: str +    collector_metadata: dict | None = None +    name: str +    outcome: URLStatus +    record_type: RecordType \ No newline at end of file diff --git a/src/db/models/instantiations/url/core/sqlalchemy.py b/src/db/models/instantiations/url/core/sqlalchemy.py index 8a476071..4b4c0159 100644 --- a/src/db/models/instantiations/url/core/sqlalchemy.py +++ b/src/db/models/instantiations/url/core/sqlalchemy.py @@ -1,16 +1,14 @@ -from sqlalchemy import Column, Integer, ForeignKey, Text, String, JSON, Enum -from sqlalchemy.dialects import postgresql +from sqlalchemy import Column, Text, String, JSON from sqlalchemy.orm import relationship from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.helpers import enum_column from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardBase -from src.db.models.types import record_type_values +from src.db.models.templates_.with_id import WithIDBase -class URL(UpdatedAtMixin, CreatedAtMixin, StandardBase): +class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): __tablename__ = 'urls' # The batch this URL is associated with @@ -84,4 +82,12 @@ class URL(UpdatedAtMixin, CreatedAtMixin, StandardBase): "URLCompressedHTML", uselist=False, back_populates="url" + ) + scrape_info = relationship( + "URLScrapeInfo", + uselist=False, + ) + web_metadata = relationship( + "URLWebMetadata", + uselist=False, ) \ No newline at end of file diff --git a/src/db/models/instantiations/url/data_source/sqlalchemy.py b/src/db/models/instantiations/url/data_source/sqlalchemy.py index b5bdb40d..270ba7e3 100644 --- a/src/db/models/instantiations/url/data_source/sqlalchemy.py +++ b/src/db/models/instantiations/url/data_source/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLDataSource(CreatedAtMixin, URLDependentMixin, StandardBase): +class URLDataSource(CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = "url_data_sources" data_source_id = 
Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/error_info/pydantic.py b/src/db/models/instantiations/url/error_info/pydantic.py index 46f5b9fa..c8596a13 100644 --- a/src/db/models/instantiations/url/error_info/pydantic.py +++ b/src/db/models/instantiations/url/error_info/pydantic.py @@ -3,9 +3,17 @@ from pydantic import BaseModel +from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel -class URLErrorPydanticInfo(BaseModel): + +class URLErrorPydanticInfo(BulkInsertableModel): task_id: int url_id: int error: str -    updated_at: Optional[datetime.datetime] = None \ No newline at end of file +    updated_at: datetime.datetime | None = None + +    @classmethod +    def sa_model(cls) -> type[Base]: +        return URLErrorInfo \ No newline at end of file diff --git a/src/db/models/instantiations/url/error_info/sqlalchemy.py b/src/db/models/instantiations/url/error_info/sqlalchemy.py index 8825777f..59f6c263 100644 --- a/src/db/models/instantiations/url/error_info/sqlalchemy.py +++ b/src/db/models/instantiations/url/error_info/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, StandardBase): +class URLErrorInfo(UpdatedAtMixin, TaskDependentMixin, URLDependentMixin, WithIDBase): __tablename__ = 'url_error_info' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/html/__init__.py b/src/db/models/instantiations/url/html/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html/compressed/__init__.py b/src/db/models/instantiations/url/html/compressed/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html/compressed/pydantic.py b/src/db/models/instantiations/url/html/compressed/pydantic.py new file mode 100644 index 00000000..b626b5c2 --- /dev/null +++ b/src/db/models/instantiations/url/html/compressed/pydantic.py @@ -0,0 +1,13 @@ +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLCompressedHTMLPydantic(BulkInsertableModel): +    url_id: int +    compressed_html: bytes + +    @classmethod +    def sa_model(cls) -> type[Base]: +        """Defines the SQLAlchemy model.""" +        return URLCompressedHTML \ No newline at end of file diff --git a/src/db/models/instantiations/url/compressed_html.py b/src/db/models/instantiations/url/html/compressed/sqlalchemy.py similarity index 86% rename from src/db/models/instantiations/url/compressed_html.py rename to src/db/models/instantiations/url/html/compressed/sqlalchemy.py index 92e340a5..995c5b25 100644 --- a/src/db/models/instantiations/url/compressed_html.py +++ b/src/db/models/instantiations/url/html/compressed/sqlalchemy.py @@ -2,13 +2,13 @@ from sqlalchemy.orm import relationship, Mapped from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - StandardBase + 
WithIDBase ): __tablename__ = 'url_compressed_html' diff --git a/src/db/models/instantiations/url/html/content/__init__.py b/src/db/models/instantiations/url/html/content/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html/content/enums.py b/src/db/models/instantiations/url/html/content/enums.py new file mode 100644 index 00000000..13820352 --- /dev/null +++ b/src/db/models/instantiations/url/html/content/enums.py @@ -0,0 +1,13 @@ +from enum import Enum + + +class HTMLContentType(Enum): + TITLE = "Title" + DESCRIPTION = "Description" + H1 = "H1" + H2 = "H2" + H3 = "H3" + H4 = "H4" + H5 = "H5" + H6 = "H6" + DIV = "Div" diff --git a/src/db/models/instantiations/url/html/content/pydantic.py b/src/db/models/instantiations/url/html/content/pydantic.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/html_content.py b/src/db/models/instantiations/url/html/content/sqlalchemy.py similarity index 82% rename from src/db/models/instantiations/url/html_content.py rename to src/db/models/instantiations/url/html/content/sqlalchemy.py index b23af35c..63e4da76 100644 --- a/src/db/models/instantiations/url/html_content.py +++ b/src/db/models/instantiations/url/html/content/sqlalchemy.py @@ -3,10 +3,14 @@ from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLHTMLContent(UpdatedAtMixin, URLDependentMixin, StandardBase): +class URLHTMLContent( + UpdatedAtMixin, + URLDependentMixin, + WithIDBase +): __tablename__ = 'url_html_content' __table_args__ = (UniqueConstraint( "url_id", diff --git a/src/db/models/instantiations/url/optional_data_source_metadata.py b/src/db/models/instantiations/url/optional_data_source_metadata.py index fac99828..bb2a95e5 100644 --- a/src/db/models/instantiations/url/optional_data_source_metadata.py +++ b/src/db/models/instantiations/url/optional_data_source_metadata.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLOptionalDataSourceMetadata(URLDependentMixin, StandardBase): +class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): __tablename__ = 'url_optional_data_source_metadata' record_formats = Column(ARRAY(String), nullable=True) diff --git a/src/db/models/instantiations/url/probed_for_404.py b/src/db/models/instantiations/url/probed_for_404.py index b795b628..478ce9de 100644 --- a/src/db/models/instantiations/url/probed_for_404.py +++ b/src/db/models/instantiations/url/probed_for_404.py @@ -2,10 +2,10 @@ from src.db.models.helpers import get_created_at_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class URLProbedFor404(URLDependentMixin, StandardBase): +class URLProbedFor404(URLDependentMixin, WithIDBase): __tablename__ = 'url_probed_for_404' last_probed_at = get_created_at_column() diff --git a/src/db/models/instantiations/url/reviewing_user.py b/src/db/models/instantiations/url/reviewing_user.py index 938f86ab..9213a157 100644 --- a/src/db/models/instantiations/url/reviewing_user.py +++ b/src/db/models/instantiations/url/reviewing_user.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from 
src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, StandardBase): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = 'reviewing_user_url' __table_args__ = ( UniqueConstraint( diff --git a/src/db/models/instantiations/url/scrape_info/__init__.py b/src/db/models/instantiations/url/scrape_info/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/scrape_info/enums.py b/src/db/models/instantiations/url/scrape_info/enums.py new file mode 100644 index 00000000..3e16fff3 --- /dev/null +++ b/src/db/models/instantiations/url/scrape_info/enums.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class ScrapeStatus(Enum): + SUCCESS = "success" + ERROR = "error" \ No newline at end of file diff --git a/src/db/models/instantiations/url/scrape_info/pydantic.py b/src/db/models/instantiations/url/scrape_info/pydantic.py new file mode 100644 index 00000000..f41b1642 --- /dev/null +++ b/src/db/models/instantiations/url/scrape_info/pydantic.py @@ -0,0 +1,13 @@ +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLScrapeInfoInsertModel(BulkInsertableModel): + url_id: int + status: ScrapeStatus + + @classmethod + def sa_model(cls) -> type[Base]: + return URLScrapeInfo \ No newline at end of file diff --git a/src/db/models/instantiations/url/scrape_info/sqlalchemy.py b/src/db/models/instantiations/url/scrape_info/sqlalchemy.py new file mode 100644 index 00000000..d97e0b93 --- /dev/null +++ b/src/db/models/instantiations/url/scrape_info/sqlalchemy.py @@ -0,0 +1,17 @@ +from src.db.models.helpers import enum_column +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.standard import StandardBase + + +class URLScrapeInfo( + StandardBase, + URLDependentMixin +): + + __tablename__ = 'url_scrape_info' + + status = enum_column( + enum_type=ScrapeStatus, + name='scrape_status', + ) \ No newline at end of file diff --git a/src/db/models/instantiations/url/suggestion/agency/auto.py b/src/db/models/instantiations/url/suggestion/agency/auto.py index 01585535..5ecfdf0a 100644 --- a/src/db/models/instantiations/url/suggestion/agency/auto.py +++ b/src/db/models/instantiations/url/suggestion/agency/auto.py @@ -3,10 +3,10 @@ from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class AutomatedUrlAgencySuggestion(URLDependentMixin, StandardBase): +class AutomatedUrlAgencySuggestion(URLDependentMixin, WithIDBase): __tablename__ = "automated_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/agency/user.py b/src/db/models/instantiations/url/suggestion/agency/user.py index 5a54399f..7a338fd0 100644 --- a/src/db/models/instantiations/url/suggestion/agency/user.py +++ b/src/db/models/instantiations/url/suggestion/agency/user.py @@ -3,10 +3,10 @@ from src.db.models.helpers import 
get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class UserUrlAgencySuggestion(URLDependentMixin, StandardBase): +class UserUrlAgencySuggestion(URLDependentMixin, WithIDBase): __tablename__ = "user_url_agency_suggestions" agency_id = get_agency_id_foreign_column(nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/record_type/auto.py b/src/db/models/instantiations/url/suggestion/record_type/auto.py index 34faf6f3..2aaed526 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/auto.py +++ b/src/db/models/instantiations/url/suggestion/record_type/auto.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values @@ -11,7 +11,7 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardBase + WithIDBase ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/record_type/user.py b/src/db/models/instantiations/url/suggestion/record_type/user.py index 77954509..8fcc816b 100644 --- a/src/db/models/instantiations/url/suggestion/record_type/user.py +++ b/src/db/models/instantiations/url/suggestion/record_type/user.py @@ -3,11 +3,11 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): +class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = "user_record_type_suggestions" user_id = Column(Integer, nullable=False) diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py index 982b4449..49dc7457 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py +++ b/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py @@ -2,10 +2,10 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import StandardBase +from src.db.models.templates_.with_id import WithIDBase -class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, StandardBase): +class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): __tablename__ = "auto_relevant_suggestions" relevant = Column(Boolean, nullable=True) diff --git a/src/db/models/instantiations/url/suggestion/relevant/user.py b/src/db/models/instantiations/url/suggestion/relevant/user.py index b087f71e..a0cfed44 100644 --- a/src/db/models/instantiations/url/suggestion/relevant/user.py +++ b/src/db/models/instantiations/url/suggestion/relevant/user.py @@ -3,14 +3,14 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin -from src.db.models.templates import 
StandardBase +from src.db.models.templates_.with_id import WithIDBase class UserRelevantSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - StandardBase + WithIDBase ): __tablename__ = "user_relevant_suggestions" diff --git a/src/db/models/instantiations/url/web_metadata/__init__.py b/src/db/models/instantiations/url/web_metadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/web_metadata/pydantic.py b/src/db/models/instantiations/url/web_metadata/pydantic.py new file mode 100644 index 00000000..c0460437 --- /dev/null +++ b/src/db/models/instantiations/url/web_metadata/pydantic.py @@ -0,0 +1,18 @@ +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLWebMetadataPydantic(BulkInsertableModel): + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URLWebMetadata + + + url_id: int + accessed: bool + status_code: int | None + content_type: str | None + error_message: str | None \ No newline at end of file diff --git a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py new file mode 100644 index 00000000..45f5233c --- /dev/null +++ b/src/db/models/instantiations/url/web_metadata/sqlalchemy.py @@ -0,0 +1,33 @@ +from sqlalchemy import Column, Text, Boolean, Integer + +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.with_id import WithIDBase + + +class URLWebMetadata( + WithIDBase, + URLDependentMixin, + CreatedAtMixin, + UpdatedAtMixin +): + """Contains information about the web page.""" + __tablename__ = "url_web_metadata" + + accessed = Column( + Boolean(), + nullable=False + ) + status_code = Column( + Integer(), + nullable=True + ) + content_type = Column( + Text(), + nullable=True + ) + error_message = Column( + Text(), + nullable=True + ) + + diff --git a/src/db/models/templates.py b/src/db/models/templates.py deleted file mode 100644 index 5e738fab..00000000 --- a/src/db/models/templates.py +++ /dev/null @@ -1,11 +0,0 @@ -from sqlalchemy import Integer, Column -from sqlalchemy.orm import declarative_base - -# Base class for SQLAlchemy ORM models -Base = declarative_base() - -class StandardBase(Base): - __abstract__ = True - - id = Column(Integer, primary_key=True, autoincrement=True) - diff --git a/src/db/models/templates_/__init__.py b/src/db/models/templates_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/templates_/base.py b/src/db/models/templates_/base.py new file mode 100644 index 00000000..0ec5f68e --- /dev/null +++ b/src/db/models/templates_/base.py @@ -0,0 +1,4 @@ +"""Base class for SQLAlchemy ORM models.""" +from sqlalchemy.orm import declarative_base + +Base = declarative_base() diff --git a/src/db/models/templates_/standard.py b/src/db/models/templates_/standard.py new file mode 100644 index 00000000..85a01941 --- /dev/null +++ b/src/db/models/templates_/standard.py @@ -0,0 +1,14 @@ +from sqlalchemy import Column, Integer + +from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base + + +class StandardBase( + Base, + CreatedAtMixin, + UpdatedAtMixin, +): + __abstract__ = True + + id = Column(Integer, primary_key=True, autoincrement=True) diff --git 
a/src/db/models/templates_/with_id.py b/src/db/models/templates_/with_id.py new file mode 100644 index 00000000..e454f215 --- /dev/null +++ b/src/db/models/templates_/with_id.py @@ -0,0 +1,11 @@ +from sqlalchemy import Integer, Column + +from src.db.models.templates_.base import Base + + + +class WithIDBase(Base): + __abstract__ = True + + id = Column(Integer, primary_key=True, autoincrement=True) + diff --git a/src/db/queries/implementations/core/get/html_content_info.py b/src/db/queries/implementations/core/get/html_content_info.py index fb26a527..d647acc1 100644 --- a/src/db/queries/implementations/core/get/html_content_info.py +++ b/src/db/queries/implementations/core/get/html_content_info.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 518aafc2..2e9a69e8 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -1,3 +1,4 @@ +from http import HTTPStatus from typing import Any from sqlalchemy import Select, select, exists, func, Subquery, and_, not_, ColumnElement @@ -11,11 +12,13 @@ from src.db.models.instantiations.link.task_url import LinkTaskURL from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.url.html_content import URLHTMLContent +from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType @@ -25,7 +28,7 @@ class StatementComposer: """ @staticmethod - def pending_urls_without_html_data() -> Select: + def has_non_errored_urls_without_html_data() -> Select: exclude_subquery = ( select(1). select_from(LinkTaskURL). @@ -35,11 +38,15 @@ def pending_urls_without_html_data() -> Select: where(Task.task_status == BatchStatus.READY_TO_LABEL.value) ) query = ( - select(URL). - outerjoin(URLHTMLContent). - where(URLHTMLContent.id == None). - where(~exists(exclude_subquery)). 
- where(URL.outcome == URLStatus.PENDING.value) + select(URL) + .join(URLWebMetadata) + .outerjoin(URLScrapeInfo) + .where( + URLScrapeInfo.id == None, + ~exists(exclude_subquery), + URLWebMetadata.status_code == HTTPStatus.OK.value, + URLWebMetadata.content_type.like("%html%"), + ) .options( selectinload(URL.batch) ) diff --git a/src/db/templates/protocols/sa_correlated/core.py b/src/db/templates/protocols/sa_correlated/core.py index 6b77c835..82475e60 100644 --- a/src/db/templates/protocols/sa_correlated/core.py +++ b/src/db/templates/protocols/sa_correlated/core.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import Protocol, runtime_checkable -from src.db.models.templates import Base +from src.db.models.templates_.base import Base @runtime_checkable diff --git a/src/db/templates/protocols/sa_correlated/with_id.py b/src/db/templates/protocols/sa_correlated/with_id.py index 4e3609e1..7e920e76 100644 --- a/src/db/templates/protocols/sa_correlated/with_id.py +++ b/src/db/templates/protocols/sa_correlated/with_id.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import Protocol, runtime_checkable -from src.db.models.templates import Base +from src.db.models.templates_.base import Base @runtime_checkable diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 1447ae87..ee442600 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -4,7 +4,7 @@ from src.core.tasks.scheduled.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.sync.data_sources.params import DataSourcesSyncParameters -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/README.md b/src/external/url_request/README.md similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/README.md rename to src/external/url_request/README.md diff --git a/src/external/url_request/__init__.py b/src/external/url_request/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/constants.py b/src/external/url_request/constants.py similarity index 100% rename from src/core/tasks/url/operators/url_html/scraper/request_interface/constants.py rename to src/external/url_request/constants.py diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py new file mode 100644 index 00000000..d17164d7 --- /dev/null +++ b/src/external/url_request/core.py @@ -0,0 +1,21 @@ +from aiohttp import ClientSession, ClientTimeout + +from src.external.url_request.dtos.url_response import URLResponseInfo +from src.external.url_request.probe.core import URLProbeManager +from src.external.url_request.probe.model import URLProbeResponse +from src.external.url_request.request import fetch_urls + + +class URLRequestInterface: + + @staticmethod + async def make_requests_with_html( + urls: list[str], + ) -> list[URLResponseInfo]: + return await fetch_urls(urls) + + @staticmethod + async def probe_urls(urls: list[str]) -> list[URLProbeResponse]: + async with 
ClientSession(timeout=ClientTimeout(total=30)) as session:
+            manager = URLProbeManager(session=session)
+            return await manager.probe_urls(urls=urls)
diff --git a/src/external/url_request/dtos/__init__.py b/src/external/url_request/dtos/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/request_resources.py b/src/external/url_request/dtos/request_resources.py
similarity index 74%
rename from src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/request_resources.py
rename to src/external/url_request/dtos/request_resources.py
index 62ad714a..01a5365f 100644
--- a/src/core/tasks/url/operators/url_html/scraper/request_interface/dtos/request_resources.py
+++ b/src/external/url_request/dtos/request_resources.py
@@ -4,7 +4,7 @@
 from aiohttp import ClientSession
 from playwright.async_api import async_playwright
 
-from src.core.tasks.url.operators.url_html.scraper.request_interface.constants import MAX_CONCURRENCY
+from src.external.url_request.constants import MAX_CONCURRENCY
 
 
 @dataclass
diff --git a/src/external/url_request/dtos/url_response.py b/src/external/url_request/dtos/url_response.py
new file mode 100644
index 00000000..57303a7c
--- /dev/null
+++ b/src/external/url_request/dtos/url_response.py
@@ -0,0 +1,11 @@
+from http import HTTPStatus
+
+from pydantic import BaseModel
+
+
+class URLResponseInfo(BaseModel):
+    success: bool
+    status: HTTPStatus | None = None
+    html: str | None = None
+    content_type: str | None = None
+    exception: str | None = None
diff --git a/src/external/url_request/probe/__init__.py b/src/external/url_request/probe/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py
new file mode 100644
index 00000000..0b5bb934
--- /dev/null
+++ b/src/external/url_request/probe/core.py
@@ -0,0 +1,41 @@
+from aiohttp import ClientSession, ClientResponseError
+
+from src.external.url_request.probe.format import format_client_response, format_client_response_error, format_error
+from src.external.url_request.probe.model import URLProbeResponse
+from tqdm.asyncio import tqdm_asyncio
+
+class URLProbeManager:
+
+    def __init__(
+        self,
+        session: ClientSession
+    ):
+        self.session = session
+
+    async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]:
+        return await tqdm_asyncio.gather(*[self.probe_url(url) for url in urls])
+
+    async def probe_url(self, url: str) -> URLProbeResponse:
+        result = await self.head(url)
+        if result.error is None:
+            return result
+        return await self.get(url)
+
+
+    async def head(self, url: str) -> URLProbeResponse:
+        try:
+            async with self.session.head(url) as response:
+                return format_client_response(url, response=response)
+        except ClientResponseError as e:
+            return format_client_response_error(url, error=e)
+        except Exception as e:
+            return format_error(url, error=e)
+
+    async def get(self, url: str) -> URLProbeResponse:
+        try:
+            async with self.session.get(url) as response:
+                return format_client_response(url, response=response)
+        except ClientResponseError as e:
+            return format_client_response_error(url, error=e)
+        except Exception as e:
+            return format_error(url, error=e)
\ No newline at end of file
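URLProbeManager issues a cheap HEAD request first and falls back to a full GET only when the HEAD errors. A minimal driver sketch, assuming the URLRequestInterface entry point defined earlier in this diff; the example.com URLs are illustrative:

    import asyncio

    from src.external.url_request.core import URLRequestInterface


    async def main() -> None:
        responses = await URLRequestInterface.probe_urls([
            "https://example.com",          # illustrative URL
            "https://example.com/missing",  # illustrative URL
        ])
        for response in responses:
            # A successful probe carries status_code and content_type; a failed
            # one carries an error string (the model validator forbids setting
            # content_type alongside error).
            print(response.url, response.status_code, response.content_type, response.error)


    asyncio.run(main())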
diff --git a/src/external/url_request/probe/format.py b/src/external/url_request/probe/format.py
new file mode 100644
index 00000000..65430c1e
--- /dev/null
+++ b/src/external/url_request/probe/format.py
@@ -0,0 +1,32 @@
+from aiohttp import ClientResponse, ClientResponseError
+
+from src.external.url_request.probe.model import URLProbeResponse
+
+
+def format_content_type(content_type: str) -> str:
+    return content_type.split(";")[0].strip()
+
+def format_client_response(url: str, response: ClientResponse) -> URLProbeResponse:
+    return URLProbeResponse(
+        url=url,
+        status_code=response.status,
+        content_type=format_content_type(
+            response.headers.get("content-type", "")
+        )
+    )
+
+def format_client_response_error(url: str, error: ClientResponseError) -> URLProbeResponse:
+    return URLProbeResponse(
+        url=url,
+        status_code=error.status,
+        content_type=None,
+        error=str(error)
+    )
+
+def format_error(url: str, error: Exception) -> URLProbeResponse:
+    return URLProbeResponse(
+        url=url,
+        status_code=None,
+        content_type=None,
+        error=str(error)
+    )
\ No newline at end of file
diff --git a/src/external/url_request/probe/model.py b/src/external/url_request/probe/model.py
new file mode 100644
index 00000000..27caa680
--- /dev/null
+++ b/src/external/url_request/probe/model.py
@@ -0,0 +1,22 @@
+from pydantic import BaseModel, model_validator
+
+
+class URLProbeResponse(BaseModel):
+    url: str
+    status_code: int | None
+    content_type: str | None
+    error: str | None = None
+
+    @model_validator(mode='after')
+    def check_error_mutually_exclusive_with_content(self):
+        if self.error is None:
+            if self.content_type is None:
+                raise ValueError('Content type required if no error')
+            if self.status_code is None:
+                raise ValueError('Status code required if no error')
+            return self
+
+        if self.content_type is not None:
+            raise ValueError('Content type mutually exclusive with error')
+
+        return self
diff --git a/src/external/url_request/request.py b/src/external/url_request/request.py
new file mode 100644
index 00000000..40fc2dd6
--- /dev/null
+++ b/src/external/url_request/request.py
@@ -0,0 +1,91 @@
+"""Functions for making HTTP requests."""
+from http import HTTPStatus
+
+from aiohttp import ClientSession, ClientResponseError
+from playwright.async_api import async_playwright
+from tqdm.asyncio import tqdm
+
+from src.external.url_request.constants import HTML_CONTENT_TYPE
+from src.external.url_request.dtos.request_resources import RequestResources
+
+from src.external.url_request.dtos.url_response import URLResponseInfo
+
+
+async def execute_get(
+    session: ClientSession,
+    url: str
+) -> URLResponseInfo:
+    try:
+        async with session.get(url, timeout=20) as response:
+            response.raise_for_status()
+            text = await response.text()
+            return URLResponseInfo(
+                success=True,
+                html=text,
+                content_type=response.headers.get("content-type"),
+                status=HTTPStatus(response.status)
+            )
+    except ClientResponseError as e:
+        return URLResponseInfo(success=False, status=HTTPStatus(e.status), exception=str(e))
+
+
+async def get_response(session: ClientSession, url: str) -> URLResponseInfo:
+    try:
+        return await execute_get(session, url)
+    except Exception as e:
+        print(f"An error occurred while fetching {url}: {e}")
+        return URLResponseInfo(success=False, exception=str(e))
+
+
+async def make_simple_requests(urls: list[str]) -> list[URLResponseInfo]:
+    async with ClientSession() as session:
+        tasks = [get_response(session, url) for url in urls]
+        results = await tqdm.gather(*tasks)
+        return results
+
+
+async def get_dynamic_html_content(
+    rr: RequestResources,
+    url: str
+) -> URLResponseInfo | None:
+    # For HTML responses, attempt to load the page to check for dynamic html content
+    async with rr.semaphore:
+        page = 
await rr.browser.new_page() + try: + await page.goto(url) + await page.wait_for_load_state("networkidle") + html_content = await page.content() + return URLResponseInfo( + success=True, + html=html_content, + content_type=HTML_CONTENT_TYPE, + status=HTTPStatus.OK + ) + except Exception as e: + return URLResponseInfo(success=False, exception=str(e)) + finally: + await page.close() + + +async def fetch_and_render( + rr: RequestResources, + url: str +) -> URLResponseInfo | None: + simple_response = await get_response(rr.session, url) + if not simple_response.success: + return simple_response + + if simple_response.content_type != HTML_CONTENT_TYPE: + return simple_response + + return await get_dynamic_html_content(rr, url) + + +async def fetch_urls(urls: list[str]) -> list[URLResponseInfo]: + async with ClientSession() as session: + async with async_playwright() as playwright: + browser = await playwright.chromium.launch(headless=True) + request_resources = RequestResources(session=session, browser=browser) + tasks = [fetch_and_render(request_resources, url) for url in urls] + results = await tqdm.gather(*tasks) + return results diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 3eb18773..13327bfd 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -61,7 +61,8 @@ def id_column() -> sa.Column: sa.Integer(), primary_key=True, autoincrement=True, - nullable=False + nullable=False, + comment='The primary identifier for the row.' ) def created_at_column() -> sa.Column: @@ -70,7 +71,8 @@ def created_at_column() -> sa.Column: 'created_at', sa.DateTime(), server_default=sa.text('now()'), - nullable=False + nullable=False, + comment='The time the row was created.' ) def updated_at_column() -> sa.Column: @@ -80,7 +82,8 @@ def updated_at_column() -> sa.Column: sa.DateTime(), server_default=sa.text('now()'), server_onupdate=sa.text('now()'), - nullable=False + nullable=False, + comment='The last time the row was updated.' ) def url_id_column() -> sa.Column: @@ -91,7 +94,8 @@ def url_id_column() -> sa.Column: 'urls.id', ondelete='CASCADE' ), - nullable=False + nullable=False, + comment='A foreign key to the `urls` table.' ) def batch_id_column(nullable=False) -> sa.Column: @@ -102,5 +106,6 @@ def batch_id_column(nullable=False) -> sa.Column: 'batches.id', ondelete='CASCADE' ), - nullable=nullable + nullable=nullable, + comment='A foreign key to the `batches` table.' 
) \ No newline at end of file diff --git a/start_mirrored_local_app.py b/start_mirrored_local_app.py index 5199fba2..9190fece 100644 --- a/start_mirrored_local_app.py +++ b/start_mirrored_local_app.py @@ -27,15 +27,8 @@ def main(): # Check cache if exists and checker = TimestampChecker() data_dump_container = docker_manager.run_container(data_dumper_docker_info) - if checker.last_run_within_24_hours(): - print("Last run within 24 hours, skipping dump...") - else: - data_dump_container.run_command( - DUMP_SH_DOCKER_PATH, - ) - data_dump_container.run_command( - RESTORE_SH_DOCKER_PATH, - ) + _run_dump_if_longer_than_24_hours(checker, data_dump_container) + _run_database_restore(data_dump_container) print("Stopping datadumper container") data_dump_container.stop() checker.set_last_run_time() @@ -44,6 +37,10 @@ def main(): apply_migrations() # Run `fastapi dev main.py` + _run_fast_api(docker_manager) + + +def _run_fast_api(docker_manager: DockerManager) -> None: try: uvicorn.run( "src.api.main:app", @@ -59,8 +56,22 @@ def main(): print("Containers stopped.") +def _run_database_restore(data_dump_container) -> None: + data_dump_container.run_command( + RESTORE_SH_DOCKER_PATH, + ) +def _run_dump_if_longer_than_24_hours( + checker: TimestampChecker, + data_dump_container +) -> None: + if checker.last_run_within_24_hours(): + print("Last run within 24 hours, skipping dump...") + return + data_dump_container.run_command( + DUMP_SH_DOCKER_PATH, + ) if __name__ == "__main__": diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py index 690b83e4..78dd0f55 100644 --- a/tests/automated/integration/api/test_annotate.py +++ b/tests/automated/integration/api/test_annotate.py @@ -9,7 +9,7 @@ from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion diff --git a/tests/automated/integration/db/client/test_delete_url_updated_at.py b/tests/automated/integration/db/client/test_delete_url_updated_at.py index 620e0318..f0bebaaf 100644 --- a/tests/automated/integration/db/client/test_delete_url_updated_at.py +++ b/tests/automated/integration/db/client/test_delete_url_updated_at.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 9fd65eed..28a2483d 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -3,7 +3,7 @@ from src.core.enums import BatchStatus from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.pydantic import URLInfo +from 
src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL diff --git a/tests/automated/integration/db/structure/testers/table.py b/tests/automated/integration/db/structure/testers/table.py index aed5d3a5..a91c0837 100644 --- a/tests/automated/integration/db/structure/testers/table.py +++ b/tests/automated/integration/db/structure/testers/table.py @@ -7,7 +7,7 @@ from sqlalchemy.exc import DataError from src.db.helpers.connect import get_postgres_connection_string -from src.db.models.templates import Base +from src.db.models.templates_.base import Base from tests.automated.integration.db.structure.testers.models.column import ColumnTester from tests.automated.integration.db.structure.types import ConstraintTester, SATypes diff --git a/tests/automated/integration/html_tag_collector/test_root_url_cache.py b/tests/automated/integration/html_tag_collector/test_root_url_cache.py index 151985cf..0add726e 100644 --- a/tests/automated/integration/html_tag_collector/test_root_url_cache.py +++ b/tests/automated/integration/html_tag_collector/test_root_url_cache.py @@ -1,7 +1,7 @@ import pytest -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo async def mock_get_request(url: str) -> RootURLCacheResponseInfo: diff --git a/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py index dc0a3452..8e345d51 100644 --- a/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.compressed_html import URLCompressedHTML +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import compress_html diff --git a/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py index 5cae5a26..7eb5a7f9 100644 --- a/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py +++ b/tests/automated/integration/tasks/url/agency_identification/happy_path/test_happy_path.py @@ -26,7 +26,6 @@ async def test_agency_identification_task( ): """Test full flow of AgencyIdentificationTaskOperator""" - # Confirm does not yet meet prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py index bd66e409..2f4e64b5 100644 --- a/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py +++ b/tests/automated/integration/tasks/url/duplicate/test_url_duplicate_task.py @@ -3,7 +3,7 @@ import pytest -from src.core.tasks.url.operators.url_duplicate.core import URLDuplicateTaskOperator +from 
src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.instantiations.url.core.sqlalchemy import URL diff --git a/tests/automated/integration/tasks/url/html/asserts.py b/tests/automated/integration/tasks/url/html/asserts.py deleted file mode 100644 index 9ca241cd..00000000 --- a/tests/automated/integration/tasks/url/html/asserts.py +++ /dev/null @@ -1,52 +0,0 @@ -from src.api.endpoints.task.by_id.dto import TaskInfo -from src.collectors.enums import URLStatus -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from tests.automated.integration.tasks.url.html.mocks.constants import MOCK_HTML_CONTENT - - -async def assert_success_url_has_two_html_content_entries( - adb: AsyncDatabaseClient, - run_info, - url_id: int -): - await adb.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids) - hci = await adb.get_html_content_info(url_id=url_id) - assert len(hci) == 2 - -async def assert_url_has_one_compressed_html_content_entry( - adb: AsyncDatabaseClient, - url_id: int -): - html = await adb.get_html_for_url(url_id=url_id) - assert html == MOCK_HTML_CONTENT - -async def assert_success_url_has_one_compressed_html_content_entry( - adb: AsyncDatabaseClient, - run_info, - url_id: int -): - await adb.link_urls_to_task(task_id=run_info.task_id, url_ids=run_info.linked_url_ids) - hci = await adb.get_html_content_info(url_id=url_id) - assert len(hci) == 1 - -async def assert_404_url_has_404_status( - adb: AsyncDatabaseClient, - url_id: int -): - url_info_404 = await adb.get_url_info_by_id(url_id=url_id) - assert url_info_404.outcome == URLStatus.NOT_FOUND - - -def assert_task_has_one_url_error(task_info): - assert len(task_info.url_errors) == 1 - assert task_info.url_errors[0].error == "test error" - - -def assert_task_type_is_html(task_info): - assert task_info.task_type == TaskType.HTML - - -def assert_html_task_ran_without_error(task_info: TaskInfo): - assert task_info.error_info is None diff --git a/tests/automated/integration/tasks/url/html/check/__init__.py b/tests/automated/integration/tasks/url/html/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/check/manager.py b/tests/automated/integration/tasks/url/html/check/manager.py new file mode 100644 index 00000000..71a48b42 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/check/manager.py @@ -0,0 +1,68 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.automated.integration.tasks.url.html.setup.models.record import TestURLHTMLTaskSetupRecord + + +class TestURLHTMLTaskCheckManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient, + records: list[TestURLHTMLTaskSetupRecord] + ): + self.adb_client = adb_client + self.records = records + self._id_to_entry = {record.url_id: record.entry for record in records} + + async def check(self): + await self._check_has_html() + await self._check_scrape_status() + await 
self._check_has_same_url_status() + await self._check_marked_as_404() + + async def _check_has_html(self) -> None: + urls_with_html = [ + record.url_id + for record in self.records + if record.entry.expected_result.has_html + ] + + compressed_html_list: list[URLCompressedHTML] = await self.adb_client.get_all(URLCompressedHTML) + assert len(compressed_html_list) == len(urls_with_html) + for compressed_html in compressed_html_list: + assert compressed_html.url_id in urls_with_html + + async def _check_scrape_status(self) -> None: + urls_with_scrape_status = [ + record.url_id + for record in self.records + if record.entry.expected_result.scrape_status is not None + ] + + url_scrape_info_list: list[URLScrapeInfo] = await self.adb_client.get_all(URLScrapeInfo) + assert len(url_scrape_info_list) == len(urls_with_scrape_status) + for url_scrape_info in url_scrape_info_list: + assert url_scrape_info.url_id in urls_with_scrape_status + entry = self._id_to_entry[url_scrape_info.url_id] + expected_scrape_status = entry.expected_result.scrape_status + assert url_scrape_info.status == expected_scrape_status + + async def _check_has_same_url_status(self): + urls: list[URL] = await self.adb_client.get_all(URL) + for url in urls: + entry = self._id_to_entry[url.id] + if entry.expected_result.web_metadata_status_marked_404: + continue + assert url.outcome == entry.url_info.status, f"URL {url.url} has outcome {url.outcome} instead of {entry.url_info.status}" + + async def _check_marked_as_404(self): + web_metadata_list: list[URLWebMetadata] = await self.adb_client.get_all( + URLWebMetadata + ) + for web_metadata in web_metadata_list: + entry = self._id_to_entry[web_metadata.url_id] + if entry.expected_result.web_metadata_status_marked_404: + assert web_metadata.status_code == 404, f"URL {entry.url_info.url} has status code {web_metadata.status_code} instead of 404" diff --git a/tests/automated/integration/tasks/url/html/mocks/constants.py b/tests/automated/integration/tasks/url/html/mocks/constants.py deleted file mode 100644 index 0b60341d..00000000 --- a/tests/automated/integration/tasks/url/html/mocks/constants.py +++ /dev/null @@ -1,3 +0,0 @@ - -MOCK_HTML_CONTENT = "" -MOCK_CONTENT_TYPE = "text/html" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/mocks/methods.py b/tests/automated/integration/tasks/url/html/mocks/methods.py index dd623ee8..d6799eea 100644 --- a/tests/automated/integration/tasks/url/html/mocks/methods.py +++ b/tests/automated/integration/tasks/url/html/mocks/methods.py @@ -1,55 +1,9 @@ -from http import HTTPStatus from typing import Optional -from aiohttp import ClientResponseError, RequestInfo - -from src.core.tasks.url.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo -from tests.automated.integration.tasks.url.html.mocks.constants import MOCK_CONTENT_TYPE, MOCK_HTML_CONTENT - - -async def mock_make_requests(self, urls: list[str]) -> list[URLResponseInfo]: - results = [] - for idx, url in enumerate(urls): - # Second result should produce a 404 - if idx == 1: - results.append( - URLResponseInfo( - success=False, - content_type=MOCK_CONTENT_TYPE, - exception=str(ClientResponseError( - request_info=RequestInfo( - url=url, - method="GET", - real_url=url, - headers={}, - ), - code=HTTPStatus.NOT_FOUND.value, - history=(None,), - )), - status=HTTPStatus.NOT_FOUND - ) - ) - continue - - if idx == 2: - # 3rd result should 
produce an error - results.append( - URLResponseInfo( - success=False, - exception=str(ValueError("test error")), - content_type=MOCK_CONTENT_TYPE - )) - else: - # All other results should succeed - results.append(URLResponseInfo( - html=MOCK_HTML_CONTENT, success=True, content_type=MOCK_CONTENT_TYPE)) - return results +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo async def mock_parse(self, url: str, html_content: str, content_type: str) -> ResponseHTMLInfo: - assert html_content == MOCK_HTML_CONTENT - assert content_type == MOCK_CONTENT_TYPE return ResponseHTMLInfo( url=url, title="fake title", diff --git a/tests/automated/integration/tasks/url/html/mocks/url_request_interface/__init__.py b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py new file mode 100644 index 00000000..a8dde5b5 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/core.py @@ -0,0 +1,11 @@ +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.html.mocks.url_request_interface.setup import setup_url_to_response_info + + +class MockURLRequestInterface: + + def __init__(self): + self._url_to_response_info: dict[str, URLResponseInfo] = setup_url_to_response_info() + + async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: + return [self._url_to_response_info[url] for url in urls] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py new file mode 100644 index 00000000..cff46013 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/mocks/url_request_interface/setup.py @@ -0,0 +1,45 @@ +from http import HTTPStatus + +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.html.setup.data import TEST_ENTRIES +from tests.automated.integration.tasks.url.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestErrorType +from tests.helpers.simple_test_data_functions import generate_test_html + + +def _get_success( + entry: TestURLHTMLTaskSetupEntry +) -> bool: + if entry.give_error is not None: + return False + return True + +def get_http_status( + entry: TestURLHTMLTaskSetupEntry +) -> HTTPStatus: + if entry.give_error is None: + return HTTPStatus.OK + if entry.give_error == TestErrorType.HTTP_404: + return HTTPStatus.NOT_FOUND + return HTTPStatus.INTERNAL_SERVER_ERROR + +def _get_content_type( + entry: TestURLHTMLTaskSetupEntry +) -> str | None: + if entry.give_error is not None: + return None + return "text/html" + + +def setup_url_to_response_info( +) -> dict[str, URLResponseInfo]: + d = {} + for entry in TEST_ENTRIES: + response_info = URLResponseInfo( + success=_get_success(entry), + status=get_http_status(entry), + html=generate_test_html() if _get_success(entry) else None, + content_type=_get_content_type(entry), + exception=None if _get_success(entry) else "Error" + ) + d[entry.url_info.url] = response_info + return d diff --git a/tests/automated/integration/tasks/url/html/setup.py b/tests/automated/integration/tasks/url/html/setup.py deleted file mode 100644 index 
e6a4de81..00000000 --- a/tests/automated/integration/tasks/url/html/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import types - -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser - -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache -from src.db.client.async_ import AsyncDatabaseClient -from tests.automated.integration.tasks.url.html.mocks.methods import mock_make_requests, mock_get_from_cache, mock_parse - - -async def setup_mocked_url_request_interface() -> URLRequestInterface: - url_request_interface = URLRequestInterface() - url_request_interface.make_requests_with_html = types.MethodType(mock_make_requests, url_request_interface) - return url_request_interface - - -async def setup_mocked_root_url_cache() -> RootURLCache: - mock_root_url_cache = RootURLCache() - mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache) - return mock_root_url_cache - - -async def setup_urls(db_data_creator) -> list[int]: - batch_id = db_data_creator.batch() - url_mappings = db_data_creator.urls(batch_id=batch_id, url_count=3).url_mappings - url_ids = [url_info.url_id for url_info in url_mappings] - return url_ids - - -async def setup_operator() -> URLHTMLTaskOperator: - html_parser = HTMLResponseParser( - root_url_cache=await setup_mocked_root_url_cache() - ) - html_parser.parse = types.MethodType(mock_parse, html_parser) - operator = URLHTMLTaskOperator( - adb_client=AsyncDatabaseClient(), - url_request_interface=await setup_mocked_url_request_interface(), - html_parser=html_parser - ) - return operator diff --git a/tests/automated/integration/tasks/url/html/setup/__init__.py b/tests/automated/integration/tasks/url/html/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/setup/data.py b/tests/automated/integration/tasks/url/html/setup/data.py new file mode 100644 index 00000000..9c488484 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/data.py @@ -0,0 +1,94 @@ +from http import HTTPStatus + +from src.collectors.enums import URLStatus +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from tests.automated.integration.tasks.url.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestURLInfo, \ + TestWebMetadataInfo, ExpectedResult, TestErrorType + +TEST_ENTRIES = [ + # URLs that give 200s should be updated with the appropriate scrape status + # and their html should be stored + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://happy-path.com/pending", + status=URLStatus.PENDING + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.OK, + error_message=None + ), + expected_result=ExpectedResult( + has_html=True, # Test for both compressed HTML and content metadata + scrape_status=ScrapeStatus.SUCCESS + ) + ), + # URLs that give 404s should be updated with the appropriate scrape status + # and their web metadata status should be updated to 404 + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://not-found-path.com/submitted", + status=URLStatus.ERROR + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.OK, + error_message=None + ), + 
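+        # The seeded web metadata above reports 200 OK; give_error below makes
+        # the mocked request interface answer with a 404, which the task is
+        # expected to write back to url_web_metadata (hence
+        # web_metadata_status_marked_404 in the expected result).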
give_error=TestErrorType.HTTP_404, + expected_result=ExpectedResult( + has_html=False, + scrape_status=ScrapeStatus.ERROR, + web_metadata_status_marked_404=True + ) + ), + # URLs that give errors should be updated with the appropriate scrape status + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://error-path.com/submitted", + status=URLStatus.ERROR + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.OK, + error_message=None + ), + give_error=TestErrorType.SCRAPER, + expected_result=ExpectedResult( + has_html=False, + scrape_status=ScrapeStatus.ERROR + ) + ), + # URLs with non-200 web metadata should not be processed + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://not-200-path.com/submitted", + status=URLStatus.PENDING + ), + web_metadata_info=TestWebMetadataInfo( + accessed=True, + content_type="text/html", + response_code=HTTPStatus.PERMANENT_REDIRECT, + error_message=None + ), + expected_result=ExpectedResult( + has_html=False, + scrape_status=None + ) + ), + # URLs with no web metadata should not be processed + TestURLHTMLTaskSetupEntry( + url_info=TestURLInfo( + url="https://no-web-metadata.com/submitted", + status=URLStatus.PENDING + ), + web_metadata_info=None, + expected_result=ExpectedResult( + has_html=False, + scrape_status=None + ) + ) +] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/setup/manager.py b/tests/automated/integration/tasks/url/html/setup/manager.py new file mode 100644 index 00000000..8e679a57 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/manager.py @@ -0,0 +1,87 @@ +import types + +from src.core.enums import RecordType +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from tests.automated.integration.tasks.url.html.mocks.methods import mock_get_from_cache, mock_parse +from tests.automated.integration.tasks.url.html.mocks.url_request_interface.core import MockURLRequestInterface +from tests.automated.integration.tasks.url.html.setup.data import TEST_ENTRIES +from tests.automated.integration.tasks.url.html.setup.models.record import TestURLHTMLTaskSetupRecord + + +class TestURLHTMLTaskSetupManager: + + def __init__(self, adb_client: AsyncDatabaseClient): + self.adb_client = adb_client + + + async def setup(self) -> list[TestURLHTMLTaskSetupRecord]: + + records = await self._setup_urls() + await self.setup_web_metadata(records) + return records + + async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: + url_insert_models: list[URLInsertModel] = [] + for entry in TEST_ENTRIES: + url_insert_model = URLInsertModel( + outcome=entry.url_info.status, + url=entry.url_info.url, + name=f"Test for {entry.url_info.url}", + record_type=RecordType.RESOURCES + ) + url_insert_models.append(url_insert_model) + url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) + + records = [] + for url_id, entry in zip(url_ids, TEST_ENTRIES): + record = TestURLHTMLTaskSetupRecord( + url_id=url_id, + entry=entry + ) + records.append(record) + return records + + async def 
setup_web_metadata( + self, + records: list[TestURLHTMLTaskSetupRecord] + ) -> None: + models = [] + for record in records: + entry = record.entry + web_metadata_info = entry.web_metadata_info + if web_metadata_info is None: + continue + model = URLWebMetadataPydantic( + url_id=record.url_id, + accessed=web_metadata_info.accessed, + status_code=web_metadata_info.response_code.value, + content_type=web_metadata_info.content_type, + error_message=web_metadata_info.error_message + ) + models.append(model) + await self.adb_client.bulk_insert(models) + + + +async def setup_mocked_root_url_cache() -> RootURLCache: + mock_root_url_cache = RootURLCache() + mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache) + return mock_root_url_cache + + +async def setup_operator() -> URLHTMLTaskOperator: + html_parser = HTMLResponseParser( + root_url_cache=await setup_mocked_root_url_cache() + ) + html_parser.parse = types.MethodType(mock_parse, html_parser) + operator = URLHTMLTaskOperator( + adb_client=AsyncDatabaseClient(), + url_request_interface=MockURLRequestInterface(), + html_parser=html_parser + ) + return operator diff --git a/tests/automated/integration/tasks/url/html/setup/models/__init__.py b/tests/automated/integration/tasks/url/html/setup/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/html/setup/models/entry.py b/tests/automated/integration/tasks/url/html/setup/models/entry.py new file mode 100644 index 00000000..8cc2a8ad --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/models/entry.py @@ -0,0 +1,34 @@ +from enum import Enum +from http import HTTPStatus + +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus + + +class TestErrorType(Enum): + SCRAPER = "scraper" + HTTP_404 = "http-404" + + +class TestWebMetadataInfo(BaseModel): + accessed: bool + content_type: str | None + response_code: HTTPStatus + error_message: str | None + +class TestURLInfo(BaseModel): + url: str + status: URLStatus + +class ExpectedResult(BaseModel): + has_html: bool + scrape_status: ScrapeStatus | None # Does not have scrape info if none + web_metadata_status_marked_404: bool = False + +class TestURLHTMLTaskSetupEntry(BaseModel): + url_info: TestURLInfo + web_metadata_info: TestWebMetadataInfo | None + give_error: TestErrorType | None = None + expected_result: ExpectedResult \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/setup/models/record.py b/tests/automated/integration/tasks/url/html/setup/models/record.py new file mode 100644 index 00000000..7902dd81 --- /dev/null +++ b/tests/automated/integration/tasks/url/html/setup/models/record.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from tests.automated.integration.tasks.url.html.setup.models.entry import TestURLHTMLTaskSetupEntry + + +class TestURLHTMLTaskSetupRecord(BaseModel): + url_id: int + entry: TestURLHTMLTaskSetupEntry \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/html/test_task.py b/tests/automated/integration/tasks/url/html/test_task.py index 2592713f..fe059838 100644 --- a/tests/automated/integration/tasks/url/html/test_task.py +++ b/tests/automated/integration/tasks/url/html/test_task.py @@ -1,41 +1,34 @@ import pytest +from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from 
tests.automated.integration.tasks.url.html.asserts import assert_success_url_has_two_html_content_entries, assert_404_url_has_404_status, assert_task_has_one_url_error, \ - assert_task_type_is_html, assert_html_task_ran_without_error, assert_url_has_one_compressed_html_content_entry -from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_url_task_has_expected_run_info -from tests.automated.integration.tasks.url.html.setup import setup_urls, setup_operator -from tests.helpers.data_creator.core import DBDataCreator +from tests.automated.integration.tasks.asserts import assert_prereqs_not_met, assert_prereqs_met, \ + assert_task_ran_without_error +from tests.automated.integration.tasks.url.html.check.manager import TestURLHTMLTaskCheckManager +from tests.automated.integration.tasks.url.html.setup.manager import setup_operator, \ + TestURLHTMLTaskSetupManager @pytest.mark.asyncio -async def test_url_html_task(db_data_creator: DBDataCreator): +async def test_url_html_task(adb_client_test: AsyncDatabaseClient): + setup = TestURLHTMLTaskSetupManager(adb_client_test) operator = await setup_operator() # No URLs were created, the prereqs should not be met await assert_prereqs_not_met(operator) - url_ids = await setup_urls(db_data_creator) - success_url_id = url_ids[0] - not_found_url_id = url_ids[1] + records = await setup.setup() + await assert_prereqs_met(operator) - task_id = await db_data_creator.adb_client.initiate_task(task_type=TaskType.HTML) + task_id = await adb_client_test.initiate_task(task_type=TaskType.HTML) run_info = await operator.run_task(task_id) - assert_url_task_has_expected_run_info(run_info, url_ids) + assert_task_ran_without_error(run_info) - - task_info = await db_data_creator.adb_client.get_task_info( - task_id=operator.task_id + checker = TestURLHTMLTaskCheckManager( + adb_client=adb_client_test, + records=records ) + await checker.check() - assert_html_task_ran_without_error(task_info) - assert_task_type_is_html(task_info) - assert_task_has_one_url_error(task_info) - - adb = db_data_creator.adb_client - await assert_success_url_has_two_html_content_entries(adb, run_info, success_url_id) - await assert_url_has_one_compressed_html_content_entry(adb, success_url_id) - await assert_404_url_has_404_status(adb, not_found_url_id) - - + await assert_prereqs_not_met(operator) diff --git a/tests/automated/integration/tasks/url/probe/__init__.py b/tests/automated/integration/tasks/url/probe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/conftest.py b/tests/automated/integration/tasks/url/probe/conftest.py new file mode 100644 index 00000000..b8836a4b --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/conftest.py @@ -0,0 +1,15 @@ +import pytest_asyncio + +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator +from src.external.url_request.core import URLRequestInterface +from tests.automated.integration.tasks.url.probe.constants import PATCH_ROOT +from tests.automated.integration.tasks.url.probe.setup.mocks.probe_manager import MockURLProbeManager + + +@pytest_asyncio.fixture +async def operator(adb_client_test, monkeypatch): + monkeypatch.setattr(PATCH_ROOT, MockURLProbeManager) + yield URLProbeTaskOperator( + adb_client=adb_client_test, + url_request_interface=URLRequestInterface() + ) diff --git a/tests/automated/integration/tasks/url/probe/constants.py b/tests/automated/integration/tasks/url/probe/constants.py new file mode 100644 index 
00000000..6bc307e5 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/constants.py @@ -0,0 +1,3 @@ + + +PATCH_ROOT = "src.external.url_request.core.URLProbeManager" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/__init__.py b/tests/automated/integration/tasks/url/probe/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/core.py b/tests/automated/integration/tasks/url/probe/setup/core.py new file mode 100644 index 00000000..1884798b --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/core.py @@ -0,0 +1,22 @@ +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES + + +async def create_urls_in_db( + adb_client: AsyncDatabaseClient, +) -> None: + record_types = [rt for rt in RecordType] + urls = [] + for idx, entry in enumerate(SETUP_ENTRIES): + url = URLInsertModel( + url=entry.url, + outcome=entry.url_status, + name=f"test-url-probe-task-url-{idx}", + record_type=record_types[idx] + ) + urls.append(url) + await adb_client.bulk_insert(urls) + diff --git a/tests/automated/integration/tasks/url/probe/setup/data.py b/tests/automated/integration/tasks/url/probe/setup/data.py new file mode 100644 index 00000000..85ad2547 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/data.py @@ -0,0 +1,36 @@ +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry +from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse + +SETUP_ENTRIES: list[TestURLProbeTaskEntry] = [ + TestURLProbeTaskEntry( + url="https://pending.com", + url_status=URLStatus.PENDING, + url_probe_response=URLProbePlannedResponse( + status_code=200, + content_type="text/html", + error=None + ), + expected_accessed=True + ), + TestURLProbeTaskEntry( + url="https://submitted.com", + url_status=URLStatus.SUBMITTED, + url_probe_response=URLProbePlannedResponse( + status_code=500, + content_type=None, + error="test error" + ), + expected_accessed=True + ), + TestURLProbeTaskEntry( + url="https://failure.com", + url_status=URLStatus.ERROR, + url_probe_response=URLProbePlannedResponse( + status_code=None, + content_type=None, + error="URL not found" + ), + expected_accessed=False + ) +] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/format.py b/tests/automated/integration/tasks/url/probe/setup/format.py new file mode 100644 index 00000000..8cb2fdb0 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/format.py @@ -0,0 +1,24 @@ +from src.external.url_request.probe.model import URLProbeResponse +from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES +from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry + + +def build_url_to_probe_response_map( +) -> dict[str, URLProbeResponse]: + d = {} + for entry in SETUP_ENTRIES: + probe_response = URLProbeResponse( + url=entry.url, + status_code=entry.url_probe_response.status_code, + content_type=entry.url_probe_response.content_type, + error=entry.url_probe_response.error + ) + d[entry.url] = 
probe_response + return d + +def build_url_to_entry_map( +) -> dict[str, TestURLProbeTaskEntry]: + d = {} + for entry in SETUP_ENTRIES: + d[entry.url] = entry + return d \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py b/tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py b/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py new file mode 100644 index 00000000..ac65ea9b --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py @@ -0,0 +1,20 @@ +from aiohttp import ClientSession + +from src.external.url_request.probe.model import URLProbeResponse +from tests.automated.integration.tasks.url.probe.setup.format import build_url_to_probe_response_map + + +class MockURLProbeManager: + + def __init__( + self, + session: ClientSession + ): + self.session = session + self._url_to_probe_response: dict[str, URLProbeResponse] = build_url_to_probe_response_map() + + async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]: + return [ + self._url_to_probe_response[url] + for url in urls + ] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/models/__init__.py b/tests/automated/integration/tasks/url/probe/setup/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/models/entry.py b/tests/automated/integration/tasks/url/probe/setup/models/entry.py new file mode 100644 index 00000000..6432de9c --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/models/entry.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse + + +class TestURLProbeTaskEntry(BaseModel): + url: str + url_status: URLStatus + url_probe_response: URLProbePlannedResponse + expected_accessed: bool diff --git a/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py b/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py new file mode 100644 index 00000000..41f17883 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class URLProbePlannedResponse(BaseModel): + status_code: int | None + content_type: str | None + error: str | None \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/queries/__init__.py b/tests/automated/integration/tasks/url/probe/setup/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/setup/queries/check.py b/tests/automated/integration/tasks/url/probe/setup/queries/check.py new file mode 100644 index 00000000..988efffc --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/queries/check.py @@ -0,0 +1,43 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.base.builder import QueryBuilderBase +from tests.automated.integration.tasks.url.probe.setup.data 
import SETUP_ENTRIES +from tests.automated.integration.tasks.url.probe.setup.format import build_url_to_entry_map +from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry + + +class CheckURLsInDBForURLProbeTaskQueryBuilder(QueryBuilderBase): + + def __init__(self): + super().__init__() + self._entries = SETUP_ENTRIES + self._url_to_entry_map: dict[ + str, TestURLProbeTaskEntry + ] = build_url_to_entry_map() + + async def run(self, session: AsyncSession) -> None: + + query = ( + select( + URL.url, + URLWebMetadata.accessed, + URLWebMetadata.status_code, + URLWebMetadata.content_type, + URLWebMetadata.error_message + ) + .join(URLWebMetadata, URL.id == URLWebMetadata.url_id) + ) + mappings = await sh.mappings(session, query=query) + assert len(mappings) == len(self._entries) + for mapping in mappings: + url = mapping["url"] + entry = self._url_to_entry_map[url] + assert entry.expected_accessed == mapping["accessed"] + assert entry.url_probe_response.status_code == mapping["status_code"] + assert entry.url_probe_response.content_type == mapping["content_type"] + assert entry.url_probe_response.error == mapping["error_message"] + diff --git a/tests/automated/integration/tasks/url/probe/test_core.py b/tests/automated/integration/tasks/url/probe/test_core.py new file mode 100644 index 00000000..ee3fe50c --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/test_core.py @@ -0,0 +1,33 @@ +import pytest + +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.setup.core import create_urls_in_db +from tests.automated.integration.tasks.url.probe.setup.queries.check import CheckURLsInDBForURLProbeTaskQueryBuilder + + +@pytest.mark.asyncio +async def test_url_probe_task( + operator: URLProbeTaskOperator +): + adb_client = operator.adb_client + # Check task does not yet meet pre-requisites + assert not await operator.meets_task_prerequisites() + + # Set up URLs + await create_urls_in_db(adb_client=adb_client) + + # Check task meets pre-requisites + assert await operator.meets_task_prerequisites() + + # Run task + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + + # Check task no longer meets pre-requisites + assert not await operator.meets_task_prerequisites() + + # Check results as expected + await adb_client.run_query_builder( + CheckURLsInDBForURLProbeTaskQueryBuilder() + ) diff --git a/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py index 8e27908b..ce9861e0 100644 --- a/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/submit_approved/test_submit_approved_url_task.py @@ -1,7 +1,7 @@ import pytest from deepdiff import DeepDiff -from src.core.tasks.url.operators.submit_approved_url.core import SubmitApprovedURLTaskOperator +from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.enums import TaskType from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource diff --git a/tests/automated/integration/tasks/url/test_url_404_probe.py b/tests/automated/integration/tasks/url/test_url_404_probe.py index 54592640..2022a8f3 
100644 --- a/tests/automated/integration/tasks/url/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/test_url_404_probe.py @@ -5,13 +5,13 @@ import pytest from aiohttp import ClientResponseError, RequestInfo -from src.core.tasks.url.operators.url_404_probe.core import URL404ProbeTaskOperator -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface +from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator +from src.external.url_request.core import URLRequestInterface from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome -from src.core.tasks.url.operators.url_html.scraper.request_interface.dtos.url_response import URLResponseInfo +from src.external.url_request.dtos.url_response import URLResponseInfo from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters diff --git a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py index ed7f1336..6e95fccb 100644 --- a/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/test_url_miscellaneous_metadata_task.py @@ -2,7 +2,7 @@ import pytest -from src.core.tasks.url.operators.url_miscellaneous_metadata.core import URLMiscellaneousMetadataTaskOperator +from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index 2cc91449..20ddc362 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -5,9 +5,9 @@ from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector +from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.fixture diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 94c3fde6..622da31b 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -4,9 +4,9 @@ from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.logger import AsyncCoreLogger 
from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector +from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.fixture diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index 672936e0..a8afe591 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -6,11 +6,11 @@ from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO from src.collectors.source_collectors.muckrock.fetch_requests.foia import FOIAFetchRequest +from src.db.models.instantiations.url.core.pydantic.info import URLInfo PATCH_ROOT = "src.collectors.source_collectors.muckrock" diff --git a/tests/conftest.py b/tests/conftest.py index f26249cd..3d9cebc6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,7 +127,7 @@ def db_data_creator( db_data_creator = DBDataCreator(db_client=db_client_test) yield db_data_creator -@pytest.fixture +@pytest_asyncio.fixture async def test_client_session() -> AsyncGenerator[ClientSession, Any]: async with ClientSession() as session: yield session diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index 6c9e95e3..dd947d65 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,5 +1,8 @@ -from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.instantiations.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo +from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.instantiations.url.scrape_info.pydantic import URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer @@ -16,6 +19,7 @@ def __init__( async def run(self) -> None: html_content_infos = [] raw_html_info_list = [] + scraper_info_list = [] for url_id in self.url_ids: html_content_infos.append( URLHTMLContentInfo( @@ -36,6 +40,12 @@ async def run(self) -> None: html="" ) raw_html_info_list.append(raw_html_info) + scraper_info = URLScrapeInfoInsertModel( + url_id=url_id, + status=ScrapeStatus.SUCCESS, + ) + scraper_info_list.append(scraper_info) await self.adb_client.add_raw_html(raw_html_info_list) await self.adb_client.add_html_content_infos(html_content_infos) + # Persist the scrape info rows built above alongside the HTML rows + await self.adb_client.bulk_insert(scraper_info_list) diff --git a/tests/helpers/data_creator/commands/impl/url_metadata.py b/tests/helpers/data_creator/commands/impl/url_metadata.py new file mode 100644 index 00000000..9d3cf4ff --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/url_metadata.py @@ -0,0 +1,31 @@ +from http import HTTPStatus + +from 
src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase + + +class URLMetadataCommand(DBDataCreatorCommandBase): + + def __init__( + self, + url_ids: list[int], + content_type: str = "text/html", + status_code: int = HTTPStatus.OK.value + ): + super().__init__() + self.url_ids = url_ids + self.content_type = content_type + self.status_code = status_code + + async def run(self) -> None: + url_metadata_infos = [] + for url_id in self.url_ids: + url_metadata = URLWebMetadataPydantic( + url_id=url_id, + accessed=True, + status_code=self.status_code, + content_type=self.content_type, + error_message=None + ) + url_metadata_infos.append(url_metadata) + await self.adb_client.bulk_insert(url_metadata_infos) \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls.py index daec2445..e4602dee 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls.py @@ -1,9 +1,9 @@ from datetime import datetime from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo +from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.simple_test_data_functions import generate_test_urls diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index f86e9a25..fed9c970 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -1,33 +1,22 @@ -from collections import defaultdict from datetime import datetime -from random import randint -from typing import List, Optional, Any +from http import HTTPStatus +from typing import Optional, Any from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.api.endpoints.review.enums import RejectionReason from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.dtos.url.html_content import URLHTMLContentInfo, HTMLContentType -from src.db.models.instantiations.url.core.pydantic import URLInfo from src.db.client.sync import DatabaseClient -from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType, URLStatus -from src.core.tasks.url.operators.submit_approved_url.tdo import SubmittedURLInfo -from src.core.tasks.url.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, 
RecordType, SuggestedStatus -from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand -from tests.helpers.data_creator.commands.impl.annotate import AnnotateCommand from tests.helpers.data_creator.commands.impl.batch import DBDataCreatorBatchCommand from tests.helpers.data_creator.commands.impl.batch_v2 import BatchV2Command from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand @@ -38,14 +27,13 @@ from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.user.record_type import UserRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.relevant import UserRelevantSuggestionCommand +from tests.helpers.data_creator.commands.impl.url_metadata import URLMetadataCommand from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.core import URLsV2Command from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 -from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo -from tests.helpers.simple_test_data_functions import generate_test_urls class DBDataCreator: @@ -366,3 +354,17 @@ async def agency_user_suggestions( agency_annotation_info=agency_annotation_info ) ) + + async def url_metadata( + self, + url_ids: list[int], + content_type: str = "text/html", + status_code: int = HTTPStatus.OK.value + ) -> None: + await self.run_command( + URLMetadataCommand( + url_ids=url_ids, + content_type=content_type, + status_code=status_code + ) + ) diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index 2145bcf1..630d0f71 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -1,6 +1,6 @@ from sqlalchemy import create_engine -from src.db.models.templates import Base +from src.db.models.templates_.base import Base def wipe_database(connection_string: str) -> None: diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index d5f2c313..df455e0e 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -12,3 +12,17 @@ def generate_test_urls(count: int) -> list[str]: results.append(url) return results + +def generate_test_html() -> str: + return """ + + +
+<html> +<head><title>Test HTML</title></head> +<body> +<p>This is an example of HTML content.</p> +</body> +</html>
+ + + """ \ No newline at end of file diff --git a/tests/manual/core/tasks/test_url_html_task_operator.py b/tests/manual/core/tasks/test_url_html_task_operator.py index f4cc36d6..b6031d77 100644 --- a/tests/manual/core/tasks/test_url_html_task_operator.py +++ b/tests/manual/core/tasks/test_url_html_task_operator.py @@ -1,12 +1,10 @@ -from unittest.mock import patch - import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO, ManualBatchInnerInputDTO -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache @pytest.mark.asyncio diff --git a/tests/manual/external/url_request/__init__.py b/tests/manual/external/url_request/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/external/url_request/test_url_probe.py b/tests/manual/external/url_request/test_url_probe.py new file mode 100644 index 00000000..d13d0f80 --- /dev/null +++ b/tests/manual/external/url_request/test_url_probe.py @@ -0,0 +1,22 @@ +import pytest + +from src.external.url_request.probe.core import URLProbeManager + +URLS = [ + "https://www.google.com", + "https://www.example.com", + "https://www.example.org", + "https://www.nonexistent.com", +] + +@pytest.mark.asyncio +async def test_url_probe_head(test_client_session): + manager = URLProbeManager(session=test_client_session) + result = await manager.head(url=URLS[0]) + print(result) + +@pytest.mark.asyncio +async def test_url_probe(test_client_session): + manager = URLProbeManager(session=test_client_session) + results = await manager.probe_urls(urls=URLS) + print(results) \ No newline at end of file diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index ef8f0df3..d7942b4a 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -1,11 +1,11 @@ import pytest -from src.core.tasks.url.operators.url_html.core import URLHTMLTaskOperator -from src.core.tasks.url.operators.url_html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.url_html.scraper.request_interface.core import URLRequestInterface -from src.core.tasks.url.operators.url_html.scraper.root_url_cache.core import RootURLCache +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.external.url_request.core import URLRequestInterface +from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic import URLInfo +from src.db.models.instantiations.url.core.pydantic_.info import URLInfo from tests.helpers.data_creator.core import DBDataCreator URLS = [ diff --git 
a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py index 612e7425..f3050d7b 100644 --- a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py +++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_deepseek_record_classifier(): - from src.db.dtos.url.html_content import HTMLContentType as hct + from src.db.models.instantiations.url.html.content.enums import HTMLContentType as hct d = { hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA", diff --git a/tests/manual/llm_api_logic/test_openai_record_classifier.py b/tests/manual/llm_api_logic/test_openai_record_classifier.py index 7f3cb67e..b0105437 100644 --- a/tests/manual/llm_api_logic/test_openai_record_classifier.py +++ b/tests/manual/llm_api_logic/test_openai_record_classifier.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_openai_record_classifier(): - from src.db.dtos.url.html_content import HTMLContentType as hct + from src.db.models.instantiations.url.html.content.enums import HTMLContentType as hct d = { hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA",
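Reviewer note (not part of the patch): the probe-task tests above never touch the network. `MockURLProbeManager` replays the planned responses declared in `SETUP_ENTRIES`, keyed by exact URL. A minimal sketch of that wiring in isolation, assuming only the helpers introduced in this diff:

```python
import asyncio

from aiohttp import ClientSession

from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES
from tests.automated.integration.tasks.url.probe.setup.mocks.probe_manager import MockURLProbeManager


async def main() -> None:
    # probe_urls() performs no I/O: it looks each URL up in the
    # url -> URLProbeResponse map built from SETUP_ENTRIES.
    async with ClientSession() as session:
        manager = MockURLProbeManager(session=session)
        responses = await manager.probe_urls(
            urls=[entry.url for entry in SETUP_ENTRIES]
        )
        for response in responses:
            print(response.url, response.status_code, response.error)


asyncio.run(main())
```

Because the map is keyed by exact URL, probing any URL absent from `SETUP_ENTRIES` raises a `KeyError`, which surfaces an unexpected probe immediately in these tests.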